Return-Path: Delivered-To: apmail-hadoop-hbase-commits-archive@minotaur.apache.org Received: (qmail 41802 invoked from network); 8 May 2010 06:28:18 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 8 May 2010 06:28:18 -0000 Received: (qmail 92156 invoked by uid 500); 8 May 2010 06:28:18 -0000 Delivered-To: apmail-hadoop-hbase-commits-archive@hadoop.apache.org Received: (qmail 92079 invoked by uid 500); 8 May 2010 06:28:17 -0000 Mailing-List: contact hbase-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hbase-dev@hadoop.apache.org Delivered-To: mailing list hbase-commits@hadoop.apache.org Received: (qmail 92072 invoked by uid 99); 8 May 2010 06:28:17 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 08 May 2010 06:28:17 +0000 X-ASF-Spam-Status: No, hits=-1687.5 required=10.0 tests=ALL_TRUSTED,AWL X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 08 May 2010 06:28:12 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 5239223888E8; Sat, 8 May 2010 06:27:16 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r942318 - in /hadoop/hbase/trunk: ./ core/src/main/java/org/apache/hadoop/hbase/ core/src/main/java/org/apache/hadoop/hbase/master/ core/src/main/java/org/apache/hadoop/hbase/regionserver/ core/src/test/java/org/apache/hadoop/hbase/ core/sr... 
Date: Sat, 08 May 2010 06:27:16 -0000 To: hbase-commits@hadoop.apache.org From: stack@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100508062716.5239223888E8@eris.apache.org> Author: stack Date: Sat May 8 06:27:15 2010 New Revision: 942318 URL: http://svn.apache.org/viewvc?rev=942318&view=rev Log: HBASE-2431 Master does not respect generation stamps, may result in meta getting permanently offlined Added: hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestServerManager.java Removed: hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransistions.java Modified: hadoop/hbase/trunk/CHANGES.txt hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerAddress.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerInfo.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/HMaster.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperation.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestRegionManager.java Modified: hadoop/hbase/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=942318&r1=942317&r2=942318&view=diff 
============================================================================== --- hadoop/hbase/trunk/CHANGES.txt (original) +++ hadoop/hbase/trunk/CHANGES.txt Sat May 8 06:27:15 2010 @@ -304,6 +304,8 @@ Release 0.21.0 - Unreleased crashes (Todd Lipcon via Stack) HBASE-2513 hbase-2414 added bug where we'd tight-loop if no root available HBASE-2503 PriorityQueue isn't thread safe, KeyValueHeap uses it that way + HBASE-2431 Master does not respect generation stamps, may result in meta + getting permanently offlined IMPROVEMENTS HBASE-1760 Cleanup TODOs in HTable Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java Sat May 8 06:27:15 2010 @@ -59,17 +59,7 @@ public class ClusterStatus extends Versi * Constructor, for Writable */ public ClusterStatus() { - } - - /** - * @return the names of region servers in the cluster - */ - public Collection getServerNames() { - ArrayList names = new ArrayList(liveServerInfo.size()); - for (HServerInfo server: liveServerInfo) { - names.add(server.getName()); - } - return names; + super(); } /** Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerAddress.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerAddress.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerAddress.java (original) +++ 
hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerAddress.java Sat May 8 06:27:15 2010 @@ -27,14 +27,12 @@ import java.io.IOException; import java.net.InetSocketAddress; /** - * HServerAddress is a "label" for a HBase server that combines the host - * name and port number. + * HServerAddress is a "label" for a HBase server made of host and port number. */ public class HServerAddress implements WritableComparable { private InetSocketAddress address; String stringValue; - /** Empty constructor, used for Writable */ public HServerAddress() { this.address = null; this.stringValue = null; @@ -51,9 +49,7 @@ public class HServerAddress implements W } /** - * Construct a HServerAddress from a string of the form hostname:port - * - * @param hostAndPort format 'hostname:port' + * @param hostAndPort Hostname and port formatted as <hostname> ':' <port> */ public HServerAddress(String hostAndPort) { int colonIndex = hostAndPort.lastIndexOf(':'); @@ -68,9 +64,8 @@ public class HServerAddress implements W } /** - * Construct a HServerAddress from hostname, port number - * @param bindAddress host name - * @param port port number + * @param bindAddress Hostname + * @param port Port number */ public HServerAddress(String bindAddress, int port) { this.address = new InetSocketAddress(bindAddress, port); @@ -78,48 +73,45 @@ public class HServerAddress implements W } /** - * Construct a HServerAddress from another HServerAddress + * Copy-constructor * - * @param other the HServerAddress to copy from + * @param other HServerAddress to copy from */ public HServerAddress(HServerAddress other) { String bindAddress = other.getBindAddress(); int port = other.getPort(); - address = new InetSocketAddress(bindAddress, port); + this.address = new InetSocketAddress(bindAddress, port); stringValue = bindAddress + ":" + port; } - /** @return bind address */ + /** @return Bind address */ public String getBindAddress() { - return address.getAddress().getHostAddress(); + return 
this.address.getAddress().getHostAddress(); } - /** @return port number */ + /** @return Port number */ public int getPort() { - return address.getPort(); + return this.address.getPort(); } - /** @return host name */ + /** @return Hostname */ public String getHostname() { - return address.getHostName(); + return this.address.getHostName(); } - /** @return the InetSocketAddress */ + /** @return The InetSocketAddress */ public InetSocketAddress getInetSocketAddress() { - return address; + return this.address; } /** - * @see java.lang.Object#toString() + * @return String formatted as <bind address> ':' <port> */ @Override public String toString() { - return (stringValue == null ? "" : stringValue); + return (this.stringValue == null ? "" : this.stringValue); } - /** - * @see java.lang.Object#equals(java.lang.Object) - */ @Override public boolean equals(Object o) { if (this == o) { @@ -134,9 +126,6 @@ public class HServerAddress implements W return this.compareTo((HServerAddress)o) == 0; } - /** - * @see java.lang.Object#hashCode() - */ @Override public int hashCode() { int result = this.address.hashCode(); Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerInfo.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerInfo.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerInfo.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/HServerInfo.java Sat May 8 06:27:15 2010 @@ -29,44 +29,46 @@ import org.apache.hadoop.io.WritableComp /** - * HServerInfo contains metainfo about an HRegionServer, Currently it only - * contains the server start code. - * - * In the future it will contain information about the source machine and - * load statistics. + * HServerInfo is meta info about an {@link HRegionServer}. 
+ * Holds hostname, ports, regionserver startcode, and load. Each server has + * a servername where servername is made up of a concatenation of + * hostname, port, and regionserver startcode. */ public class HServerInfo implements WritableComparable { + /** + * This character is used as separator making up the servername. + * Servername is made of host, port, and startcode formatted as + * <hostname> '{@link #SERVERNAME_SEPARATOR}' <port> '{@link #SERVERNAME_SEPARATOR}' <startcode> + * where {@link #SERVERNAME_SEPARATOR} is usually a ','. + */ + public static final String SERVERNAME_SEPARATOR = ","; + private HServerAddress serverAddress; private long startCode; private HServerLoad load; private int infoPort; + // Servername is made of hostname, port and startcode. private String serverName = null; - private String name; + // Hostname of the regionserver. + private String hostname; private static Map dnsCache = new HashMap(); - /** default constructor - used by Writable */ public HServerInfo() { this(new HServerAddress(), 0, HConstants.DEFAULT_REGIONSERVER_INFOPORT, "default name"); } - /** - * Constructor - * @param serverAddress - * @param startCode - * @param infoPort Port the info server is listening on. 
- */ public HServerInfo(HServerAddress serverAddress, long startCode, - final int infoPort, String name) { + final int infoPort, String hostname) { this.serverAddress = serverAddress; this.startCode = startCode; this.load = new HServerLoad(); this.infoPort = infoPort; - this.name = name; + this.hostname = hostname; } /** - * Construct a new object using another as input (like a copy constructor) + * Copy-constructor * @param other */ public HServerInfo(HServerInfo other) { @@ -74,72 +76,53 @@ public class HServerInfo implements Writ this.startCode = other.getStartCode(); this.load = other.getLoad(); this.infoPort = other.getInfoPort(); - this.name = other.getName(); + this.hostname = other.hostname; } - /** - * @return the load - */ public HServerLoad getLoad() { return load; } - /** - * @param load the load to set - */ public void setLoad(HServerLoad load) { this.load = load; } - /** @return the server address */ public synchronized HServerAddress getServerAddress() { return new HServerAddress(serverAddress); } - /** - * Change the server address. - * @param serverAddress New server address - */ public synchronized void setServerAddress(HServerAddress serverAddress) { this.serverAddress = serverAddress; this.serverName = null; } - /** @return the server start code */ public synchronized long getStartCode() { return startCode; } - /** - * @return Port the info server is listening on. 
- */ public int getInfoPort() { return this.infoPort; } - /** - * @param infoPort - new port of info server - */ public void setInfoPort(int infoPort) { this.infoPort = infoPort; } - /** - * @param startCode the startCode to set - */ public synchronized void setStartCode(long startCode) { this.startCode = startCode; this.serverName = null; } /** - * @return the server name in the form hostname_startcode_port + * @return Server name made of the concatenation of hostname, port and + * startcode formatted as <hostname> ',' <port> ',' <startcode> */ public synchronized String getServerName() { if (this.serverName == null) { // if we have the hostname of the RS, use it - if(this.name != null) { - this.serverName = getServerName(this.name, this.serverAddress.getPort(), this.startCode); + if(this.hostname != null) { + this.serverName = + getServerName(this.hostname, this.serverAddress.getPort(), this.startCode); } // go to DNS name resolution only if we dont have the name of the RS else { @@ -150,33 +133,68 @@ public class HServerInfo implements Writ } /** - * Get the hostname of the server - * @return hostname + * @param serverAddress In form <hostname> ':' <port> + * @param startCode Server startcode + * @return Server name made of the concatenation of hostname, port and + * startcode formatted as <hostname> ',' <port> ',' <startcode> */ - public String getName() { + public static String getServerName(String serverAddress, long startCode) { + String name = null; + if (serverAddress != null) { + int colonIndex = serverAddress.lastIndexOf(':'); + if(colonIndex < 0) { + throw new IllegalArgumentException("Not a host:port pair: " + serverAddress); + } + String host = serverAddress.substring(0, colonIndex); + int port = + Integer.valueOf(serverAddress.substring(colonIndex + 1)).intValue(); + if(!dnsCache.containsKey(host)) { + HServerAddress address = new HServerAddress(serverAddress); + dnsCache.put(host, address.getHostname()); + } + host = dnsCache.get(host); + name = 
getServerName(host, port, startCode); + } return name; } /** - * Set the hostname of the server - * @param name hostname + * @param address Server address + * @param startCode Server startcode + * @return Server name made of the concatenation of hostname, port and + * startcode formatted as <hostname> ',' <port> ',' <startcode> */ - public void setName(String name) { - this.name = name; + public static String getServerName(HServerAddress address, long startCode) { + return getServerName(address.getHostname(), address.getPort(), startCode); + } + + /* + * @param hostName + * @param port + * @param startCode + * @return Server name made of the concatenation of hostname, port and + * startcode formatted as <hostname> ',' <port> ',' <startcode> + */ + private static String getServerName(String hostName, int port, long startCode) { + StringBuilder name = new StringBuilder(hostName); + name.append(SERVERNAME_SEPARATOR); + name.append(port); + name.append(SERVERNAME_SEPARATOR); + name.append(startCode); + return name.toString(); } /** - * @see java.lang.Object#toString() + * @return ServerName and load concatenated. 
+ * @see #getServerName() + * @see #getLoad() */ @Override public String toString() { - return "address: " + this.serverAddress + ", startcode: " + this.startCode - + ", load: (" + this.load.toString() + ")"; + return "serverName=" + getServerName() + + ", load=(" + this.load.toString() + ")"; } - /** - * @see java.lang.Object#equals(java.lang.Object) - */ @Override public boolean equals(Object obj) { if (this == obj) { @@ -191,23 +209,17 @@ public class HServerInfo implements Writ return compareTo((HServerInfo)obj) == 0; } - /** - * @see java.lang.Object#hashCode() - */ @Override public int hashCode() { return this.getServerName().hashCode(); } - - // Writable - public void readFields(DataInput in) throws IOException { this.serverAddress.readFields(in); this.startCode = in.readLong(); this.load.readFields(in); this.infoPort = in.readInt(); - this.name = in.readUTF(); + this.hostname = in.readUTF(); } public void write(DataOutput out) throws IOException { @@ -215,61 +227,10 @@ public class HServerInfo implements Writ out.writeLong(this.startCode); this.load.write(out); out.writeInt(this.infoPort); - out.writeUTF(name); + out.writeUTF(hostname); } public int compareTo(HServerInfo o) { return this.getServerName().compareTo(o.getServerName()); } - - /** - * @param info - * @return the server name in the form hostname_startcode_port - */ - private static String getServerName(HServerInfo info) { - return getServerName(info.getServerAddress(), info.getStartCode()); - } - - /** - * @param serverAddress in the form hostname:port - * @param startCode - * @return the server name in the form hostname_startcode_port - */ - public static String getServerName(String serverAddress, long startCode) { - String name = null; - if (serverAddress != null) { - int colonIndex = serverAddress.lastIndexOf(':'); - if(colonIndex < 0) { - throw new IllegalArgumentException("Not a host:port pair: " + serverAddress); - } - String host = serverAddress.substring(0, colonIndex); - int port = - 
Integer.valueOf(serverAddress.substring(colonIndex + 1)).intValue(); - if(!dnsCache.containsKey(host)) { - HServerAddress address = new HServerAddress(serverAddress); - dnsCache.put(host, address.getHostname()); - } - host = dnsCache.get(host); - name = getServerName(host, port, startCode); - } - return name; - } - - /** - * @param address - * @param startCode - * @return the server name in the form hostname_startcode_port - */ - public static String getServerName(HServerAddress address, long startCode) { - return getServerName(address.getHostname(), address.getPort(), startCode); - } - - private static String getServerName(String hostName, int port, long startCode) { - StringBuilder name = new StringBuilder(hostName); - name.append(","); - name.append(port); - name.append(","); - name.append(startCode); - return name.toString(); - } } Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/LocalHBaseCluster.java Sat May 8 06:27:15 2010 @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -127,7 +128,8 @@ public class LocalHBaseCluster implement return addRegionServer(this.regionThreads.size()); } - public JVMClusterUtil.RegionServerThread addRegionServer(final int index) throws IOException { + public JVMClusterUtil.RegionServerThread addRegionServer(final int index) + throws IOException { JVMClusterUtil.RegionServerThread rst = JVMClusterUtil.createRegionServerThread(this.conf, this.regionServerClass, index); 
this.regionThreads.add(rst); @@ -157,6 +159,20 @@ public class LocalHBaseCluster implement } /** + * @return List of running servers (Some servers may have been killed or + * aborted during lifetime of cluster; these servers are not included in this + * list). + */ + public List getLiveRegionServers() { + List liveServers = + new ArrayList(); + for (JVMClusterUtil.RegionServerThread rst: getRegionServers()) { + if (rst.isAlive()) liveServers.add(rst); + } + return liveServers; + } + + /** * Wait for the specified region server to stop * Removes this thread from list of running threads. * @param serverNumber Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/HMaster.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sat May 8 06:27:15 2010 @@ -114,7 +114,7 @@ public class HMaster extends Thread impl // Use AtomicBoolean rather than plain boolean because we want other threads // able to set shutdown flag. Using AtomicBoolean can pass a reference // rather than have them have to know about the hosting Master class. - private final AtomicBoolean closed = new AtomicBoolean(true); + final AtomicBoolean closed = new AtomicBoolean(true); // TODO: Is this separate flag necessary? 
private final AtomicBoolean shutdownRequested = new AtomicBoolean(false); @@ -515,7 +515,7 @@ public class HMaster extends Thread impl HRegionInterface hri = this.connection.getHRegionConnection(address, false); HServerInfo info = hri.getHServerInfo(); - LOG.debug("Inspection found server " + info.getName()); + LOG.debug("Inspection found server " + info.getServerName()); this.serverManager.recordNewServer(info, true); regions = hri.getRegionsAssignment(); } catch (IOException e) { Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java Sat May 8 06:27:15 2010 @@ -54,7 +54,6 @@ class ProcessServerShutdown extends Regi private boolean rootRescanned; private HServerAddress deadServerAddress; - private static class ToDoEntry { boolean regionOffline; final HRegionInfo info; @@ -85,24 +84,19 @@ class ProcessServerShutdown extends Regi } private void closeMetaRegions() { - isRootServer = master.getRegionManager().isRootServer(deadServerAddress); - if (isRootServer) { - master.getRegionManager().unsetRootRegion(); - } - else { - //HBASE-1928: Check whether this server has been transitioning the ROOT table - isRootServer = master.getRegionManager().isRootServerCandidate (deadServer); - if (isRootServer) { - master.getRegionManager().unsetRootRegion(); - } + this.isRootServer = + this.master.getRegionManager().isRootServer(this.deadServerAddress) || + this.master.getRegionManager().isRootServerCandidate (deadServer); + if (this.isRootServer) { + 
this.master.getRegionManager().unsetRootRegion(); } + List metaStarts = + this.master.getRegionManager().listMetaRegionsForServer(deadServerAddress); - List metaStarts = master.getRegionManager().listMetaRegionsForServer(deadServerAddress); - - metaRegions = new ArrayList(); - for (byte [] region : metaStarts) { - MetaRegion r = master.getRegionManager().offlineMetaRegion(region); - metaRegions.add(r); + this.metaRegions = new ArrayList(); + for (byte [] startKey: metaStarts) { + MetaRegion r = master.getRegionManager().offlineMetaRegionWithStartKey(startKey); + this.metaRegions.add(r); } //HBASE-1928: Check whether this server has been transitioning the META table @@ -194,7 +188,7 @@ class ProcessServerShutdown extends Regi Bytes.toString(info.getRegionName()) + " from online meta regions"); } - master.getRegionManager().offlineMetaRegion(info.getStartKey()); + master.getRegionManager().offlineMetaRegionWithStartKey(info.getStartKey()); } ToDoEntry todo = new ToDoEntry(info); Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java Sat May 8 06:27:15 2010 @@ -786,11 +786,12 @@ public class RegionManager implements HC /** * Set an online MetaRegion offline - remove it from the map. - * @param startKey region name + * @param startKey Startkey to use finding region to remove. * @return the MetaRegion that was taken offline. 
*/ - public MetaRegion offlineMetaRegion(byte [] startKey) { - LOG.info("META region removed from onlineMetaRegions"); + public MetaRegion offlineMetaRegionWithStartKey(byte [] startKey) { + LOG.info("META region whose startkey is " + Bytes.toString(startKey) + + " removed from onlineMetaRegions"); return onlineMetaRegions.remove(startKey); } @@ -920,7 +921,7 @@ public class RegionManager implements HC for ( MetaRegion region : onlineMetaRegions.values() ) { if (server.equals(region.getServer())) { LOG.info("Offlining META region: " + region); - offlineMetaRegion(region.getStartKey()); + offlineMetaRegionWithStartKey(region.getStartKey()); // Set for reassignment. setUnassigned(region.getRegionInfo(), true); hasMeta = true; @@ -1379,7 +1380,7 @@ public class RegionManager implements HC } // check if current server is overloaded - int numRegionsToClose = balanceFromOverloaded(servLoad, avg); + int numRegionsToClose = balanceFromOverloaded(info, servLoad, avg); // check if we can unload server by low loaded servers if(numRegionsToClose <= 0) { @@ -1401,13 +1402,14 @@ public class RegionManager implements HC * Check if server load is not overloaded (with load > avgLoadPlusSlop). * @return number of regions to unassign. 
*/ - private int balanceFromOverloaded(HServerLoad srvLoad, double avgLoad) { + private int balanceFromOverloaded(final HServerInfo info, + final HServerLoad srvLoad, final double avgLoad) { int avgLoadPlusSlop = (int)Math.ceil(avgLoad * (1 + this.slop)); int numSrvRegs = srvLoad.getNumberOfRegions(); if (numSrvRegs > avgLoadPlusSlop) { if (LOG.isDebugEnabled()) { - LOG.debug("Server is overloaded: load=" + numSrvRegs + - ", avg=" + avgLoad + ", slop=" + this.slop); + LOG.debug("Server " + info.getServerName() + " is overloaded: load=" + + numSrvRegs + ", avg=" + avgLoad + ", slop=" + this.slop); } return numSrvRegs - (int)Math.ceil(avgLoad); } Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperation.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperation.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperation.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/RegionServerOperation.java Sat May 8 06:27:15 2010 @@ -33,6 +33,8 @@ abstract class RegionServerOperation imp private long expire; protected final HMaster master; + /* How long we stay on queue. 
+ */ private int delay; protected RegionServerOperation(HMaster master) { Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Sat May 8 06:27:15 2010 @@ -117,12 +117,14 @@ public class ServerManager implements HC if (numDeadServers > 0) { StringBuilder sb = new StringBuilder("Dead Server ["); boolean first = true; - for (String server: deadServers) { - if (!first) { - sb.append(", "); - first = false; + synchronized (deadServers) { + for (String server: deadServers) { + if (!first) { + sb.append(", "); + first = false; + } + sb.append(server); } - sb.append(server); } sb.append("]"); deadServersList = sb.toString(); @@ -159,47 +161,37 @@ public class ServerManager implements HC /** * Let the server manager know a new regionserver has come online * @param serverInfo - * @throws Leases.LeaseStillHeldException + * @throws IOException */ void regionServerStartup(final HServerInfo serverInfo) - throws Leases.LeaseStillHeldException { + throws IOException { + // Test for case where we get a region startup message from a regionserver + // that has been quickly restarted but whose znode expiration handler has + // not yet run, or from a server whose fail we are currently processing. 
HServerInfo info = new HServerInfo(serverInfo); - String serverName = info.getServerName(); - if (this.serversToServerInfo.containsKey(serverName) || - this.deadServers.contains(serverName)) { - LOG.debug("Server start was rejected: " + serverInfo); - LOG.debug("serversToServerInfo.containsKey: " + - this.serversToServerInfo.containsKey(serverName)); - LOG.debug("deadServers.contains: " + - this.deadServers.contains(serverName)); - // TODO: Check zk instead. - throw new Leases.LeaseStillHeldException(serverName); - } - LOG.info("Received start message from: " + serverName); - // Go on to process the regionserver registration. - HServerLoad load = this.serversToLoad.remove(serverName); - if (load != null) { - // The startup message was from a known server. - // Remove stale information about the server's load. - synchronized (this.loadToServers) { - Set servers = loadToServers.get(load); - if (servers != null) { - servers.remove(serverName); - if (servers.size() > 0) - this.loadToServers.put(load, servers); - else - this.loadToServers.remove(load); - } - } - } - HServerInfo storedInfo = this.serversToServerInfo.remove(serverName); - if (storedInfo != null && !this.master.isClosed()) { - // The startup message was from a known server with the same name. - // Timeout the old one right away. 
- this.master.getRegionManager().getRootRegionLocation(); - RegionServerOperation op = new ProcessServerShutdown(master, storedInfo); - this.master.getRegionServerOperationQueue().put(op); + String hostAndPort = info.getServerAddress().toString(); + HServerInfo existingServer = + this.serverAddressToServerInfo.get(info.getServerAddress()); + if (existingServer != null) { + LOG.info("Server start rejected; we already have " + hostAndPort + + " registered; existingServer=" + existingServer + ", newServer=" + info); + if (existingServer.getStartCode() < info.getStartCode()) { + LOG.info("Triggering server recovery; existingServer looks stale"); + expireServer(existingServer); + } + throw new Leases.LeaseStillHeldException(hostAndPort); + } + if (isDead(hostAndPort, true)) { + LOG.debug("Server start rejected; currently processing " + hostAndPort + + " failure"); + throw new Leases.LeaseStillHeldException(hostAndPort); + } + if (isDead(hostAndPort, true)) { + LOG.debug("Server start rejected; currently processing " + hostAndPort + + " failure"); + throw new Leases.LeaseStillHeldException(hostAndPort); } + LOG.info("Received start message from: " + info.getServerName()); recordNewServer(info); } @@ -223,7 +215,7 @@ public class ServerManager implements HC info.setLoad(load); // We must set this watcher here because it can be set on a fresh start // or on a failover - Watcher watcher = new ServerExpirer(serverName, info.getServerAddress()); + Watcher watcher = new ServerExpirer(new HServerInfo(info)); this.master.getZooKeeperWrapper().updateRSLocationGetWatch(info, watcher); this.serversToServerInfo.put(serverName, info); this.serverAddressToServerInfo.put(info.getServerAddress(), info); @@ -318,7 +310,7 @@ public class ServerManager implements HC synchronized (this.serversToServerInfo) { removeServerInfo(info.getServerName(), info.getServerAddress()); - this.serversToServerInfo.notifyAll(); + notifyServers(); } return new HMsg[] {HMsg.REGIONSERVER_STOP}; @@ -337,53 
+329,44 @@ public class ServerManager implements HC */ private void processRegionServerExit(HServerInfo serverInfo, HMsg[] msgs) { synchronized (this.serversToServerInfo) { - try { - // This method removes ROOT/META from the list and marks them to be reassigned - // in addition to other housework. - if (removeServerInfo(serverInfo.getServerName(), - serverInfo.getServerAddress())) { - // Only process the exit message if the server still has registered info. - // Otherwise we could end up processing the server exit twice. - LOG.info("Region server " + serverInfo.getServerName() + - ": MSG_REPORT_EXITING"); - // Get all the regions the server was serving reassigned - // (if we are not shutting down). - if (!this.master.isClosed()) { - for (int i = 1; i < msgs.length; i++) { - LOG.info("Processing " + msgs[i] + " from " + - serverInfo.getServerName()); - assert msgs[i].getType() == HMsg.Type.MSG_REGION_CLOSE; - HRegionInfo info = msgs[i].getRegionInfo(); - // Meta/root region offlining is handed in removeServerInfo above. - if (!info.isMetaRegion()) { - synchronized (this.master.getRegionManager()) { - if (!this.master.getRegionManager().isOfflined( - info.getRegionNameAsString())) { - this.master.getRegionManager().setUnassigned(info, true); - } else { - this.master.getRegionManager().removeRegion(info); - } + // This method removes ROOT/META from the list and marks them to be + // reassigned in addition to other housework. + if (removeServerInfo(serverInfo.getServerName(), serverInfo.getServerAddress())) { + // Only process the exit message if the server still has registered info. + // Otherwise we could end up processing the server exit twice. + LOG.info("Region server " + serverInfo.getServerName() + + ": MSG_REPORT_EXITING"); + // Get all the regions the server was serving reassigned + // (if we are not shutting down). 
+ if (!master.closed.get()) { + for (int i = 1; i < msgs.length; i++) { + LOG.info("Processing " + msgs[i] + " from " + + serverInfo.getServerName()); + assert msgs[i].getType() == HMsg.Type.MSG_REGION_CLOSE; + HRegionInfo info = msgs[i].getRegionInfo(); + // Meta/root region offlining is handed in removeServerInfo above. + if (!info.isMetaRegion()) { + synchronized (master.getRegionManager()) { + if (!master.getRegionManager().isOfflined(info.getRegionNameAsString())) { + master.getRegionManager().setUnassigned(info, true); + } else { + master.getRegionManager().removeRegion(info); } } } } - - // There should not be any regions in transition for this server - the - // server should finish transitions itself before closing - Map inTransition = - master.getRegionManager().getRegionsInTransitionOnServer( - serverInfo.getServerName()); - for (Map.Entry entry : inTransition.entrySet()) { - LOG.warn("Region server " + serverInfo.getServerName() + - " shut down with region " + entry.getKey() + " in transition " + - "state " + entry.getValue()); - master.getRegionManager().setUnassigned(entry.getValue().getRegionInfo(), true); - } } - // We don't need to return anything to the server because it isn't - // going to do any more work. - } finally { - this.serversToServerInfo.notifyAll(); + // There should not be any regions in transition for this server - the + // server should finish transitions itself before closing + Map inTransition = master.getRegionManager() + .getRegionsInTransitionOnServer(serverInfo.getServerName()); + for (Map.Entry entry : inTransition.entrySet()) { + LOG.warn("Region server " + serverInfo.getServerName() + + " shut down with region " + entry.getKey() + " in transition " + + "state " + entry.getValue()); + master.getRegionManager().setUnassigned(entry.getValue().getRegionInfo(), + true); + } } } } @@ -526,7 +509,7 @@ public class ServerManager implements HC assignSplitDaughter(b); if (region.isMetaTable()) { // A meta region has split. 
- this.master.getRegionManager().offlineMetaRegion(region.getStartKey()); + this. master.getRegionManager().offlineMetaRegionWithStartKey(region.getStartKey()); this.master.getRegionManager().incrementNumMetaRegions(); } } @@ -648,7 +631,7 @@ public class ServerManager implements HC } else if (region.isMetaTable()) { // Region is part of the meta table. Remove it from onlineMetaRegions - this.master.getRegionManager().offlineMetaRegion(region.getStartKey()); + this.master.getRegionManager().offlineMetaRegionWithStartKey(region.getStartKey()); } boolean offlineRegion = @@ -816,44 +799,58 @@ public class ServerManager implements HC /** Watcher triggered when a RS znode is deleted */ private class ServerExpirer implements Watcher { - private String server; - private HServerAddress serverAddress; + private HServerInfo server; - ServerExpirer(String server, HServerAddress serverAddress) { - this.server = server; - this.serverAddress = serverAddress; + ServerExpirer(final HServerInfo hsi) { + this.server = hsi; } public void process(WatchedEvent event) { - if (event.getType().equals(EventType.NodeDeleted)) { - LOG.info(server + " znode expired"); - // Remove the server from the known servers list and update load info - serverAddressToServerInfo.remove(serverAddress); - HServerInfo info = serversToServerInfo.remove(server); - if (info != null) { - String serverName = info.getServerName(); - HServerLoad load = serversToLoad.remove(serverName); - if (load != null) { - synchronized (loadToServers) { - Set servers = loadToServers.get(load); - if (servers != null) { - servers.remove(serverName); - if(servers.size() > 0) - loadToServers.put(load, servers); - else - loadToServers.remove(load); - } - } - } - deadServers.add(server); - RegionServerOperation op = new ProcessServerShutdown(master, info); - master.getRegionServerOperationQueue().put(op); - } - synchronized (serversToServerInfo) { - serversToServerInfo.notifyAll(); + if (!event.getType().equals(EventType.NodeDeleted)) 
{ + LOG.warn("Unexpected event=" + event); + return; + } + LOG.info(this.server.getServerName() + " znode expired"); + expireServer(this.server); + } + } + + /* + * Expire the passed server. Add it to list of deadservers and queue a + * shutdown processing. + */ + private synchronized void expireServer(final HServerInfo hsi) { + // First check a server to expire. ServerName is of the form: + // , , + String serverName = hsi.getServerName(); + HServerInfo info = this.serversToServerInfo.get(serverName); + if (info == null) { + LOG.warn("No HServerInfo for " + serverName); + return; + } + if (this.deadServers.contains(serverName)) { + LOG.warn("Already processing shutdown of " + serverName); + return; + } + // Remove the server from the known servers lists and update load info + this.serverAddressToServerInfo.remove(info.getServerAddress()); + this.serversToServerInfo.remove(serverName); + HServerLoad load = this.serversToLoad.remove(serverName); + if (load != null) { + synchronized (this.loadToServers) { + Set servers = this.loadToServers.get(load); + if (servers != null) { + servers.remove(serverName); + if (servers.isEmpty()) this.loadToServers.remove(load); } } } + // Add to dead servers and queue a shutdown processing. + LOG.debug("Added=" + serverName + + " to dead servers, added shutdown processing operation"); + this.deadServers.add(serverName); + this.master.getRegionServerOperationQueue(). + put(new ProcessServerShutdown(master, info)); } /** @@ -867,8 +864,33 @@ public class ServerManager implements HC * @param serverName * @return true if server is dead */ - boolean isDead(String serverName) { - return this.deadServers.contains(serverName); + public boolean isDead(final String serverName) { + return isDead(serverName, false); + } + + /** + * @param serverName Servername as either host:port or + * host,port,startcode. 
+ * @param hostAndPortOnly True if serverName is host and + * port only (host:port) and if so, then we do a prefix compare + * (ignoring start codes) looking for dead server. + * @return true if server is dead + */ + boolean isDead(final String serverName, final boolean hostAndPortOnly) { + return isDead(this.deadServers, serverName, hostAndPortOnly); + } + + static boolean isDead(final Set deadServers, + final String serverName, final boolean hostAndPortOnly) { + if (!hostAndPortOnly) return deadServers.contains(serverName); + String serverNameColonReplaced = + serverName.replaceFirst(":", HServerInfo.SERVERNAME_SEPARATOR); + for (String hostPortStartCode: deadServers) { + int index = hostPortStartCode.lastIndexOf(HServerInfo.SERVERNAME_SEPARATOR); + String hostPortStrippedOfStartCode = hostPortStartCode.substring(0, index); + if (hostPortStrippedOfStartCode.equals(serverNameColonReplaced)) return true; + } + return false; } Set getDeadServers() { Modified: hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original) +++ hadoop/hbase/trunk/core/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Sat May 8 06:27:15 2010 @@ -133,6 +133,8 @@ public class HRegionServer implements HC // debugging and unit tests. 
protected volatile boolean abortRequested; + private volatile boolean killed = false; + // If false, the file system has become unavailable protected volatile boolean fsOk; @@ -612,7 +614,9 @@ public class HRegionServer implements HC hlogRoller.interruptIfNecessary(); this.majorCompactionChecker.interrupt(); - if (abortRequested) { + if (killed) { + // Just skip out w/o closing regions. + } else if (abortRequested) { if (this.fsOk) { // Only try to clean up if the file system is available try { @@ -665,9 +669,10 @@ public class HRegionServer implements HC this.hbaseMaster = null; } - join(); - this.zooKeeperWrapper.close(); - + if (!killed) { + join(); + this.zooKeeperWrapper.close(); + } LOG.info(Thread.currentThread().getName() + " exiting"); } @@ -1208,6 +1213,16 @@ public class HRegionServer implements HC stop(); } + /* + * Simulate a kill -9 of this server. + * Exits w/o closing regions or cleaninup logs but it does close socket in + * case want to bring up server on old hostname+port immediately. + */ + protected void kill() { + this.killed = true; + abort(); + } + /** * Wait on all threads to finish. * Presumption is that all closes and stops have already been called. 
Modified: hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java (original) +++ hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java Sat May 8 06:27:15 2010 @@ -28,23 +28,35 @@ import java.util.concurrent.ConcurrentHa import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.client.HConnectionManager; import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.io.MapWritable; +import org.apache.hadoop.security.UnixUserGroupInformation; +import org.apache.hadoop.security.UserGroupInformation; /** - * This class creates a single process HBase cluster. One thread is run for - * each server started. Pass how many instances of a RegionServer you want - * running in your cluster-in-a-single-jvm. Its modeled on MiniDFSCluster. - * Uses {@link LocalHBaseCluster}. Will run on top of whatever the currently - * configured FileSystem. + * This class creates a single process HBase cluster. + * each server. The master uses the 'default' FileSystem. The RegionServers, + * if we are running on DistributedFilesystem, create a FileSystem instance + * each and will close down their instance on the way out. 
*/ public class MiniHBaseCluster implements HConstants { static final Log LOG = LogFactory.getLog(MiniHBaseCluster.class.getName()); + // Cache this. For some reason only works first time I get it. TODO: Figure + // out why. + private final static UserGroupInformation UGI; + static { + UGI = UserGroupInformation.getCurrentUGI(); + } + private Configuration conf; public LocalHBaseCluster hbaseCluster; @@ -110,12 +122,74 @@ public class MiniHBaseCluster implements } /** - * Subclass so can get at protected methods (none at moment). + * Subclass so can get at protected methods (none at moment). Also, creates + * a FileSystem instance per instantiation. Adds a shutdown own FileSystem + * on the way out. Shuts down own Filesystem only, not All filesystems as + * the FileSystem system exit hook does. */ public static class MiniHBaseClusterRegionServer extends HRegionServer { + private static int index = 0; + public MiniHBaseClusterRegionServer(Configuration conf) throws IOException { - super(conf); + super(setDifferentUser(conf)); + } + + /* + * @param c + * @param currentfs We return this if we did not make a new one. + * @param uniqueName Same name used to help identify the created fs. + * @return A new fs instance if we are up on DistributeFileSystem. + * @throws IOException + */ + private static Configuration setDifferentUser(final Configuration c) + throws IOException { + FileSystem currentfs = FileSystem.get(c); + if (!(currentfs instanceof DistributedFileSystem)) return c; + // Else distributed filesystem. Make a new instance per daemon. Below + // code is taken from the AppendTestUtil over in hdfs. + Configuration c2 = new Configuration(c); + String username = UGI.getUserName() + ".hrs." 
+ index++; + UnixUserGroupInformation.saveToConf(c2, + UnixUserGroupInformation.UGI_PROPERTY_NAME, + new UnixUserGroupInformation(username, new String[]{"supergroup"})); + return c2; + } + + @Override + protected void init(MapWritable c) throws IOException { + super.init(c); + // Change shutdown hook to only shutdown the FileSystem added above by + // {@link #getFileSystem(HBaseConfiguration) + if (getFileSystem() instanceof DistributedFileSystem) { + Thread t = new SingleFileSystemShutdownThread(getFileSystem()); + Runtime.getRuntime().addShutdownHook(t); + } + } + + public void kill() { + super.kill(); + } + } + + /** + * Alternate shutdown hook. + * Just shuts down the passed fs, not all as default filesystem hook does. + */ + static class SingleFileSystemShutdownThread extends Thread { + private final FileSystem fs; + SingleFileSystemShutdownThread(final FileSystem fs) { + super("Shutdown of " + fs); + this.fs = fs; + } + @Override + public void run() { + try { + LOG.info("Hook closing fs=" + this.fs); + this.fs.close(); + } catch (IOException e) { + LOG.warn("Running hook", e); + } } } @@ -178,10 +252,6 @@ public class MiniHBaseCluster implements */ public String abortRegionServer(int serverNumber) { HRegionServer server = getRegionServer(serverNumber); - /*TODO: Prove not needed in TRUNK - // // Don't run hdfs shutdown thread. - // server.setHDFSShutdownThreadOnExit(null); - */ LOG.info("Aborting " + server.toString()); server.abort(); return server.toString(); @@ -212,10 +282,6 @@ public class MiniHBaseCluster implements JVMClusterUtil.RegionServerThread server = hbaseCluster.getRegionServers().get(serverNumber); LOG.info("Stopping " + server.toString()); - if (!shutdownFS) { - // Stop the running of the hdfs shutdown thread in tests. 
- server.getRegionServer().setShutdownHDFS(false); - } server.getRegionServer().stop(); return server; } @@ -269,6 +335,13 @@ public class MiniHBaseCluster implements } /** + * @return List of live region server threads (skips the aborted and the killed) + */ + public List getLiveRegionServerThreads() { + return this.hbaseCluster.getLiveRegionServers(); + } + + /** * Grab a numbered region server of your choice. * @param serverNumber * @return region server Added: hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java?rev=942318&view=auto ============================================================================== --- hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java (added) +++ hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java Sat May 8 06:27:15 2010 @@ -0,0 +1,577 @@ +/** + * Copyright 2010 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.master; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.net.BindException; +import java.util.Collection; +import java.util.Set; +import java.util.concurrent.CopyOnWriteArraySet; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HMsg; +import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.HServerAddress; +import org.apache.hadoop.hbase.HServerInfo; +import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Threads; +import org.apache.hadoop.hbase.util.Writables; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Test transitions of state across the master. Sets up the cluster once and + * then runs a couple of tests. 
+ */ +public class TestMasterTransitions { + private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class); + private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + private static final String TABLENAME = "master_transitions"; + private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"), + Bytes.toBytes("b"), Bytes.toBytes("c")}; + + /** + * Start up a mini cluster and put a small table of many empty regions into it. + * @throws Exception + */ + @BeforeClass public static void beforeAllTests() throws Exception { + TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true); + // Start a cluster of two regionservers. + TEST_UTIL.startMiniCluster(2); + // Create a table of three families. This will assign a region. + TEST_UTIL.createTable(Bytes.toBytes(TABLENAME), FAMILIES); + HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); + int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily()); + waitUntilAllRegionsAssigned(countOfRegions); + addToEachStartKey(countOfRegions); + } + + @AfterClass public static void afterAllTests() throws IOException { + TEST_UTIL.shutdownMiniCluster(); + } + + @Before public void setup() throws IOException { + if (TEST_UTIL.getHBaseCluster().getLiveRegionServerThreads().size() < 2) { + // Need at least two servers. + LOG.info("Started new server=" + + TEST_UTIL.getHBaseCluster().startRegionServer()); + + } + } + + /** + * Listener for regionserver events testing hbase-2428 (Infinite loop of + * region closes if META region is offline). In particular, listen + * for the close of the 'metaServer' and when it comes in, requeue it with a + * delay as though there were an issue processing the shutdown. As part of + * the requeuing, send over a close of a region on 'otherServer' so it comes + * into a master that has its meta region marked as offline. 
+ */ + static class HBase2428Listener implements RegionServerOperationListener { + // Map of what we've delayed so we don't do do repeated delays. + private final Set postponed = + new CopyOnWriteArraySet(); + private boolean done = false;; + private boolean metaShutdownReceived = false; + private final HServerAddress metaAddress; + private final MiniHBaseCluster cluster; + private final int otherServerIndex; + private final HRegionInfo hri; + private int closeCount = 0; + static final int SERVER_DURATION = 3 * 1000; + static final int CLOSE_DURATION = 1 * 1000; + + HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress, + final HRegionInfo closingHRI, final int otherServerIndex) { + this.cluster = c; + this.metaAddress = metaAddress; + this.hri = closingHRI; + this.otherServerIndex = otherServerIndex; + } + + @Override + public boolean process(final RegionServerOperation op) throws IOException { + // If a regionserver shutdown and its of the meta server, then we want to + // delay the processing of the shutdown and send off a close of a region on + // the 'otherServer. + boolean result = true; + if (op instanceof ProcessServerShutdown) { + ProcessServerShutdown pss = (ProcessServerShutdown)op; + if (pss.getDeadServerAddress().equals(this.metaAddress)) { + // Don't postpone more than once. + if (!this.postponed.contains(pss)) { + // Close some region. + this.cluster.addMessageToSendRegionServer(this.otherServerIndex, + new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri, + Bytes.toBytes("Forcing close in test"))); + this.postponed.add(pss); + // Put off the processing of the regionserver shutdown processing. + pss.setDelay(SERVER_DURATION); + this.metaShutdownReceived = true; + // Return false. This will add this op to the delayed queue. + result = false; + } + } + } else { + // Have the close run frequently. + if (isWantedCloseOperation(op) != null) { + op.setDelay(CLOSE_DURATION); + // Count how many times it comes through here. 
+ this.closeCount++; + } + } + return result; + } + + public void processed(final RegionServerOperation op) { + if (isWantedCloseOperation(op) != null) return; + this.done = true; + } + + /* + * @param op + * @return Null if not the wanted ProcessRegionClose, else op + * cast as a ProcessRegionClose. + */ + private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) { + // Count every time we get a close operation. + if (op instanceof ProcessRegionClose) { + ProcessRegionClose c = (ProcessRegionClose)op; + if (c.regionInfo.equals(hri)) { + return c; + } + } + return null; + } + + boolean isDone() { + return this.done; + } + + boolean isMetaShutdownReceived() { + return metaShutdownReceived; + } + + int getCloseCount() { + return this.closeCount; + } + + @Override + public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { + return true; + } + } + + /** + * In 2428, the meta region has just been set offline and then a close comes + * in. + * @see HBASE-2428 + */ + @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428() + throws Exception { + LOG.info("Running testRegionCloseWhenNoMetaHBase2428"); + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + final HMaster master = cluster.getMaster(); + int metaIndex = cluster.getServerWithMeta(); + // Figure the index of the server that is not server the .META. + int otherServerIndex = -1; + for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) { + if (i == metaIndex) continue; + otherServerIndex = i; + break; + } + final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex); + final HRegionServer metaHRS = cluster.getRegionServer(metaIndex); + + // Get a region out on the otherServer. 
+ final HRegionInfo hri = + otherServer.getOnlineRegions().iterator().next().getRegionInfo(); + + // Add our RegionServerOperationsListener + HBase2428Listener listener = new HBase2428Listener(cluster, + metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex); + master.getRegionServerOperationQueue(). + registerRegionServerOperationListener(listener); + try { + // Now close the server carrying meta. + cluster.abortRegionServer(metaIndex); + + // First wait on receipt of meta server shutdown message. + while(!listener.metaShutdownReceived) Threads.sleep(100); + while(!listener.isDone()) Threads.sleep(10); + // We should not have retried the close more times than it took for the + // server shutdown message to exit the delay queue and get processed + // (Multiple by two to add in some slop in case of GC or something). + assertTrue(listener.getCloseCount() > 1); + assertTrue(listener.getCloseCount() < + ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2)); + + // Assert the closed region came back online + assertRegionIsBackOnline(hri); + } finally { + master.getRegionServerOperationQueue(). + unregisterRegionServerOperationListener(listener); + } + } + + /** + * Test adding in a new server before old one on same host+port is dead. + * Make the test more onerous by having the server under test carry the meta. + * If confusion between old and new, purportedly meta never comes back. Test + * that meta gets redeployed. 
+ */ + @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413() + throws IOException { + LOG.info("Running testAddingServerBeforeOldIsDead2413"); + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + int count = count(); + int metaIndex = cluster.getServerWithMeta(); + MiniHBaseClusterRegionServer metaHRS = + (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex); + int port = metaHRS.getServerInfo().getServerAddress().getPort(); + Configuration c = TEST_UTIL.getConfiguration(); + String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0"); + try { + LOG.info("KILLED=" + metaHRS); + metaHRS.kill(); + c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port)); + // Try and start new regionserver. It might clash with the old + // regionserver port so keep trying to get past the BindException. + HRegionServer hrs = null; + while (true) { + try { + hrs = cluster.startRegionServer().getRegionServer(); + break; + } catch (IOException e) { + if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) { + InvocationTargetException ee = (InvocationTargetException)e.getCause(); + if (ee.getCause() != null && ee.getCause() instanceof BindException) { + LOG.info("BindException; retrying: " + e.toString()); + } + } + } + } + LOG.info("STARTED=" + hrs); + // Wait until he's been given at least 3 regions before we go on to try + // and count rows in table. + while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100); + LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() + + " regions"); + assertEquals(count, count()); + } finally { + c.set(HConstants.REGIONSERVER_PORT, oldPort); + } + } + + + /** + * HBase2482 is about outstanding region openings. If any are outstanding + * when a regionserver goes down, then they'll never deploy. They'll be + * stuck in the regions-in-transition list for ever. This listener looks + * for a region opening HMsg and if its from the server passed on construction, + * then we kill it. 
It also looks out for a close message on the victim + * server because that signifies start of the fireworks. + */ + static class HBase2482Listener implements RegionServerOperationListener { + private final HRegionServer victim; + private boolean abortSent = false; + // We closed regions on new server. + private volatile boolean closed = false; + // Copy of regions on new server + private final Collection copyOfOnlineRegions; + // This is the region that was in transition on the server we aborted. Test + // passes if this region comes back online successfully. + private HRegionInfo regionToFind; + + HBase2482Listener(final HRegionServer victim) { + this.victim = victim; + // Copy regions currently open on this server so I can notice when + // there is a close. + this.copyOfOnlineRegions = + this.victim.getCopyOfOnlineRegionsSortedBySize().values(); + } + + @Override + public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { + if (!victim.getServerInfo().equals(serverInfo) || + this.abortSent || !this.closed) { + return true; + } + if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true; + // Save the region that is in transition so can test later it came back. + this.regionToFind = incomingMsg.getRegionInfo(); + LOG.info("ABORTING " + this.victim + " because got a " + + HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " + + incomingMsg.getRegionInfo().getRegionNameAsString()); + this.victim.abort(); + this.abortSent = true; + return true; + } + + @Override + public boolean process(RegionServerOperation op) throws IOException { + return true; + } + + @Override + public void processed(RegionServerOperation op) { + if (this.closed || !(op instanceof ProcessRegionClose)) return; + ProcessRegionClose close = (ProcessRegionClose)op; + for (HRegion r: this.copyOfOnlineRegions) { + if (r.getRegionInfo().equals(close.regionInfo)) { + // We've closed one of the regions that was on the victim server. 
+ // Now can start testing for when all regions are back online again + LOG.info("Found close of " + + r.getRegionInfo().getRegionNameAsString() + + "; setting close happened flag"); + this.closed = true; + break; + } + } + } + } + + /** + * In 2482, a RS with an opening region on it dies. The said region is then + * stuck in the master's regions-in-transition and never leaves it. This + * test works by bringing up a new regionserver, waiting for the load + * balancer to give it some regions. Then, we close all on the new server. + * After sending all the close messages, we send the new regionserver the + * special blocking message so it can not process any more messages. + * Meantime reopening of the just-closed regions is backed up on the new + * server. Soon as master gets an opening region from the new regionserver, + * we kill it. We then wait on all regions to come back on line. If bug + * is fixed, this should happen soon as the processing of the killed server is + * done. + * @see HBASE-2482 + */ + @Test (timeout=300000) public void testKillRSWithOpeningRegion2482() + throws Exception { + LOG.info("Running testKillRSWithOpeningRegion2482"); + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + if (cluster.getLiveRegionServerThreads().size() < 2) { + // Need at least two servers. + cluster.startRegionServer(); + } + // Count how many regions are online. They need to be all back online for + // this test to succeed. + int countOfMetaRegions = countOfMetaRegions(); + // Add a listener on the server. + HMaster m = cluster.getMaster(); + // Start new regionserver. + MiniHBaseClusterRegionServer hrs = + (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer(); + LOG.info("Started new regionserver: " + hrs.toString()); + // Wait until has some regions before proceeding. Balancer will give it some. 
+ int minimumRegions = + countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2); + while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100); + // Set the listener only after some regions have been opened on new server. + HBase2482Listener listener = new HBase2482Listener(hrs); + m.getRegionServerOperationQueue(). + registerRegionServerOperationListener(listener); + try { + // Go close all non-catalog regions on this new server + closeAllNonCatalogRegions(cluster, hrs); + // After all closes, add blocking message before the region opens start to + // come in. + cluster.addMessageToSendRegionServer(hrs, + new HMsg(HMsg.Type.TESTING_MSG_BLOCK_RS)); + // Wait till one of the above close messages has an effect before we start + // wait on all regions back online. + while (!listener.closed) Threads.sleep(100); + LOG.info("Past close"); + // Make sure the abort server message was sent. + while(!listener.abortSent) Threads.sleep(100); + LOG.info("Past abort send; waiting on all regions to redeploy"); + // Now wait for regions to come back online. + assertRegionIsBackOnline(listener.regionToFind); + } finally { + m.getRegionServerOperationQueue(). 
+ unregisterRegionServerOperationListener(listener); + } + } + + /* + * @return Count of all non-catalog regions on the designated server + */ + private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster, + final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs) + throws IOException { + int countOfRegions = 0; + for (HRegion r: hrs.getOnlineRegions()) { + if (r.getRegionInfo().isMetaRegion()) continue; + cluster.addMessageToSendRegionServer(hrs, + new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo())); + LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() + + " on " + hrs.toString()); + countOfRegions++; + } + return countOfRegions; + } + + private void assertRegionIsBackOnline(final HRegionInfo hri) + throws IOException { + // Region should have an entry in its startkey because of addRowToEachRegion. + byte [] row = getStartKey(hri); + HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); + Get g = new Get(row); + assertTrue((t.get(g)).size() > 0); + } + + /* + * Wait until all rows in .META. have a non-empty info:server. This means + * all regions have been deployed, master has been informed and updated + * .META. with the regions deployed server. + * @param countOfRegions How many regions in .META. + * @throws IOException + */ + private static void waitUntilAllRegionsAssigned(final int countOfRegions) + throws IOException { + HTable meta = new HTable(TEST_UTIL.getConfiguration(), + HConstants.META_TABLE_NAME); + while (true) { + int rows = 0; + Scan scan = new Scan(); + scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); + ResultScanner s = meta.getScanner(scan); + for (Result r = null; (r = s.next()) != null;) { + byte [] b = + r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); + if (b == null || b.length <= 0) break; + rows++; + } + s.close(); + // If I get to here and all rows have a Server, then all have been assigned. 
+ if (rows == countOfRegions) break; + LOG.info("Found=" + rows); + Threads.sleep(1000); + } + } + + /* + * @return Count of regions in meta table. + * @throws IOException + */ + private static int countOfMetaRegions() + throws IOException { + HTable meta = new HTable(TEST_UTIL.getConfiguration(), + HConstants.META_TABLE_NAME); + int rows = 0; + Scan scan = new Scan(); + scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); + ResultScanner s = meta.getScanner(scan); + for (Result r = null; (r = s.next()) != null;) { + byte [] b = + r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); + if (b == null || b.length <= 0) break; + rows++; + } + s.close(); + return rows; + } + + /* + * Add to each of the regions in .META. a value. Key is the startrow of the + * region (except its 'aaa' for first region). Actual value is the row name. + * @param expected + * @return + * @throws IOException + */ + private static int addToEachStartKey(final int expected) throws IOException { + HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); + HTable meta = new HTable(TEST_UTIL.getConfiguration(), + HConstants.META_TABLE_NAME); + int rows = 0; + Scan scan = new Scan(); + scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER); + ResultScanner s = meta.getScanner(scan); + for (Result r = null; (r = s.next()) != null;) { + byte [] b = + r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER); + if (b == null || b.length <= 0) break; + HRegionInfo hri = Writables.getHRegionInfo(b); + // If start key, add 'aaa'. 
+ byte [] row = getStartKey(hri); + Put p = new Put(row); + p.add(getTestFamily(), getTestQualifier(), row); + t.put(p); + rows++; + } + s.close(); + Assert.assertEquals(expected, rows); + return rows; + } + + /* + * @return Count of rows in TABLENAME + * @throws IOException + */ + private static int count() throws IOException { + HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); + int rows = 0; + Scan scan = new Scan(); + ResultScanner s = t.getScanner(scan); + for (Result r = null; (r = s.next()) != null;) { + rows++; + } + s.close(); + LOG.info("Counted=" + rows); + return rows; + } + + /* + * @param hri + * @return Start key for hri (If start key is '', then return 'aaa'. + */ + private static byte [] getStartKey(final HRegionInfo hri) { + return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())? + Bytes.toBytes("aaa"): hri.getStartKey(); + } + + private static byte [] getTestFamily() { + return FAMILIES[0]; + } + + private static byte [] getTestQualifier() { + return getTestFamily(); + } +} Modified: hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestRegionManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestRegionManager.java?rev=942318&r1=942317&r2=942318&view=diff ============================================================================== --- hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestRegionManager.java (original) +++ hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestRegionManager.java Sat May 8 06:27:15 2010 @@ -62,7 +62,7 @@ public class TestRegionManager extends H HRegionInfo regionInfoX = new HRegionInfo(tableDesc, startKeyX, endKeyX); - master.getRegionManager().offlineMetaRegion(startKey0); + master.getRegionManager().offlineMetaRegionWithStartKey(startKey0); master.getRegionManager().putMetaRegionOnline(meta0); master.getRegionManager().putMetaRegionOnline(meta1); 
master.getRegionManager().putMetaRegionOnline(meta2); Added: hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestServerManager.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestServerManager.java?rev=942318&view=auto ============================================================================== --- hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestServerManager.java (added) +++ hadoop/hbase/trunk/core/src/test/java/org/apache/hadoop/hbase/master/TestServerManager.java Sat May 8 06:27:15 2010 @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.master; + +import static org.junit.Assert.*; + +import java.util.HashSet; +import java.util.Set; + +import org.junit.Test; + + +public class TestServerManager { + @Test public void testIsDead() { + Set deadServers = new HashSet(); + final String hostname123 = "one,123,3"; + assertFalse(ServerManager.isDead(deadServers, hostname123, false)); + assertFalse(ServerManager.isDead(deadServers, hostname123, true)); + deadServers.add(hostname123); + assertTrue(ServerManager.isDead(deadServers, hostname123, false)); + assertFalse(ServerManager.isDead(deadServers, "one:1", true)); + assertFalse(ServerManager.isDead(deadServers, "one:1234", true)); + assertTrue(ServerManager.isDead(deadServers, "one:123", true)); + } +} \ No newline at end of file