Return-Path: Delivered-To: apmail-lucene-hadoop-commits-archive@locus.apache.org Received: (qmail 22358 invoked from network); 5 Nov 2007 05:06:59 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 5 Nov 2007 05:06:59 -0000 Received: (qmail 48945 invoked by uid 500); 5 Nov 2007 05:06:47 -0000 Delivered-To: apmail-lucene-hadoop-commits-archive@lucene.apache.org Received: (qmail 48914 invoked by uid 500); 5 Nov 2007 05:06:47 -0000 Mailing-List: contact hadoop-commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hadoop-dev@lucene.apache.org Delivered-To: mailing list hadoop-commits@lucene.apache.org Received: (qmail 48905 invoked by uid 99); 5 Nov 2007 05:06:47 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 04 Nov 2007 21:06:47 -0800 X-ASF-Spam-Status: No, hits=-100.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.3] (HELO eris.apache.org) (140.211.11.3) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 05 Nov 2007 05:06:58 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 016C51A9832; Sun, 4 Nov 2007 21:06:37 -0800 (PST) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r591880 - in /lucene/hadoop/trunk/src/contrib/hbase: CHANGES.txt src/test/org/apache/hadoop/hbase/TestRegionServerExit.java Date: Mon, 05 Nov 2007 05:06:37 -0000 To: hadoop-commits@lucene.apache.org From: jimk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20071105050638.016C51A9832@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jimk Date: Sun Nov 4 21:06:35 2007 New Revision: 591880 URL: http://svn.apache.org/viewvc?rev=591880&view=rev Log: HADOOP-2109 - Fix another race condition in processing dead servers, - Fix error online meta regions: was using 
region name and not startKey as key for map.put. - Change TestRegionServerExit to always kill the region server for the META region. This makes the test more deterministic and getting META reassigned was problematic. Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerExit.java Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt?rev=591880&r1=591879&r2=591880&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt (original) +++ lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt Sun Nov 4 21:06:35 2007 @@ -22,7 +22,10 @@ HADOOP-2137 hql.jsp : The character 0x19 is not valid HADOOP-2109 Fix another race condition in processing dead servers, Fix error online meta regions: was using region name and not - startKey as key for map.put + startKey as key for map.put. Change TestRegionServerExit to + always kill the region server for the META region. This makes + the test more deterministic and getting META reassigned was + problematic. 
IMPROVEMENTS HADOOP-2401 Add convenience put method that takes writable Modified: lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerExit.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerExit.java?rev=591880&r1=591879&r2=591880&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerExit.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerExit.java Sun Nov 4 21:06:35 2007 @@ -20,6 +20,8 @@ package org.apache.hadoop.hbase; import java.io.IOException; +import java.util.Collection; +import java.util.List; import java.util.TreeMap; import org.apache.commons.logging.Log; @@ -55,9 +57,9 @@ // Start up a new region server to take over serving of root and meta // after we shut down the current meta/root host. this.cluster.startRegionServer(); - // Now abort the region server and wait for it to go down. - this.cluster.abortRegionServer(0); - LOG.info(this.cluster.waitOnRegionServer(0) + " has been aborted"); + // Now abort the meta region server and wait for it to go down and come back + stopOrAbortMetaRegionServer(true); + // Verify that everything is back up. Thread t = startVerificationThread(tableName, row); t.start(); threadDumpingJoin(t); @@ -76,9 +78,9 @@ // Start up a new region server to take over serving of root and meta // after we shut down the current meta/root host. this.cluster.startRegionServer(); - // Now shutdown the region server and wait for it to go down. - this.cluster.stopRegionServer(0); - LOG.info(this.cluster.waitOnRegionServer(0) + " has been shutdown"); + // Now abort the meta region server and wait for it to go down and come back + stopOrAbortMetaRegionServer(false); + // Verify that everything is back up. 
Thread t = startVerificationThread(tableName, row); t.start(); threadDumpingJoin(t); @@ -98,6 +100,41 @@ table.commit(lockid); return row; } + + /* + * Stop the region server serving the meta region and wait for the meta region + * to get reassigned. This is always the most problematic case. + * + * @param abort set to true if region server should be aborted, if false it + * is just shut down. + */ + private void stopOrAbortMetaRegionServer(boolean abort) { + List regionThreads = + cluster.getRegionThreads(); + + int server = -1; + for (int i = 0; i < regionThreads.size() && server == -1; i++) { + HRegionServer s = regionThreads.get(i).getRegionServer(); + Collection<HRegion> regions = s.getOnlineRegions().values(); + for (HRegion r : regions) { + if (r.getTableDesc().getName().equals(HConstants.META_TABLE_NAME)) { + server = i; + } + } + } + if (server == -1) { + LOG.fatal("could not find region server serving meta region"); + fail(); + } + if (abort) { + this.cluster.abortRegionServer(server); + + } else { + this.cluster.stopRegionServer(server); + } + LOG.info(this.cluster.waitOnRegionServer(server) + " has been " + + (abort ? "aborted" : "shut down")); + } /* * Run verification in a thread so I can concurrently run a thread-dumper @@ -111,6 +148,18 @@ final Text row) { Runnable runnable = new Runnable() { public void run() { + try { + // Now try to open a scanner on the meta table. Should stall until + // meta server comes back up. + HTable t = new HTable(conf, HConstants.META_TABLE_NAME); + HScannerInterface s = + t.obtainScanner(HConstants.COLUMN_FAMILY_ARRAY, new Text()); + s.close(); + + } catch (IOException e) { + LOG.fatal("could not re-open meta table because", e); + fail(); + } HScannerInterface scanner = null; try { // Verify that the client can find the data after the region has moved