Return-Path: Delivered-To: apmail-hadoop-hbase-commits-archive@locus.apache.org Received: (qmail 70547 invoked from network); 11 Dec 2008 21:53:59 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 11 Dec 2008 21:53:59 -0000 Received: (qmail 40002 invoked by uid 500); 11 Dec 2008 21:54:12 -0000 Delivered-To: apmail-hadoop-hbase-commits-archive@hadoop.apache.org Received: (qmail 39989 invoked by uid 500); 11 Dec 2008 21:54:12 -0000 Mailing-List: contact hbase-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hbase-dev@hadoop.apache.org Delivered-To: mailing list hbase-commits@hadoop.apache.org Received: (qmail 39978 invoked by uid 99); 11 Dec 2008 21:54:12 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 11 Dec 2008 13:54:12 -0800 X-ASF-Spam-Status: No, hits=0.0 required=10.0 tests= X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 11 Dec 2008 21:53:57 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id 063C42388882; Thu, 11 Dec 2008 13:53:36 -0800 (PST) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r725828 - in /hadoop/hbase/trunk: CHANGES.txt src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Date: Thu, 11 Dec 2008 21:53:35 -0000 To: hbase-commits@hadoop.apache.org From: jimk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20081211215336.063C42388882@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jimk Date: Thu Dec 11 13:53:35 2008 New Revision: 725828 URL: http://svn.apache.org/viewvc?rev=725828&view=rev Log: HBASE-1052 Stopping a HRegionServer with unflushed cache causes data loss from org.apache.hadoop.hbase.DroppedSnapshotException Modified: hadoop/hbase/trunk/CHANGES.txt hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Modified: hadoop/hbase/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/CHANGES.txt?rev=725828&r1=725827&r2=725828&view=diff ============================================================================== --- hadoop/hbase/trunk/CHANGES.txt (original) +++ hadoop/hbase/trunk/CHANGES.txt Thu Dec 11 13:53:35 2008 @@ -104,6 +104,8 @@ HBASE-900 Regionserver memory leak causing OOME during relatively modest bulk importing; part 1 HBASE-1054 Index NPE on scanning (Clint Morgan via Andrew Purtell) + HBASE-1052 Stopping a HRegionServer with unflushed cache causes data loss + from org.apache.hadoop.hbase.DroppedSnapshotException IMPROVEMENTS HBASE-901 Add a limit to key length, check key and value length on client side Modified: hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java URL: http://svn.apache.org/viewvc/hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=725828&r1=725827&r2=725828&view=diff ============================================================================== --- hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original) +++ hadoop/hbase/trunk/src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Thu Dec 11 13:53:35 2008 @@ -24,6 +24,7 @@ import java.lang.management.ManagementFactory; import java.lang.management.MemoryUsage; import java.lang.reflect.Constructor; +import java.lang.reflect.Field; import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.Arrays; @@ -282,10 +283,6 @@ for(int i = 0; i < nbBlocks; i++) { reservedSpace.add(new byte[DEFAULT_SIZE_RESERVATION_BLOCK]); } - - // Register shutdown hook for HRegionServer, runs an orderly shutdown - // when a kill signal is recieved - Runtime.getRuntime().addShutdownHook(new ShutdownThread(this)); } /** @@ -522,6 +519,15 @@ this.hbaseMaster = null; } join(); + + LOG.info("Running hdfs shutdown thread"); + hdfsShutdownThread.start(); + try { + hdfsShutdownThread.join(); + LOG.info("Hdfs shutdown thread completed."); + } catch (InterruptedException e) { + LOG.warn("hdfsShutdownThread.join() was interrupted", e); + } LOG.info(Thread.currentThread().getName() + " exiting"); } @@ -552,6 +558,13 @@ // to defaults). this.conf.set("fs.default.name", this.conf.get("hbase.rootdir")); this.fs = FileSystem.get(this.conf); + + // Register shutdown hook for HRegionServer, runs an orderly shutdown + // when a kill signal is recieved + Runtime.getRuntime().addShutdownHook(new ShutdownThread(this, + Thread.currentThread())); + this.hdfsShutdownThread = suppressHdfsShutdownHook(); + this.rootDir = new Path(this.conf.get(HConstants.HBASE_DIR)); this.log = setupHLog(); this.logFlusher.setHLog(log); @@ -693,25 +706,34 @@ */ private static class ShutdownThread extends Thread { private final HRegionServer instance; + private final Thread mainThread; /** * @param instance + * @param mainThread */ - public ShutdownThread(HRegionServer instance) { + public ShutdownThread(HRegionServer instance, Thread mainThread) { this.instance = instance; + this.mainThread = mainThread; } @Override public void run() { LOG.info("Starting shutdown thread."); - // tell the region server to stop and wait for it to complete + // tell the region server to stop instance.stop(); - instance.join(); + + // Wait for main thread to exit. + Threads.shutdown(mainThread); + LOG.info("Shutdown thread complete"); } } + // We need to call HDFS shutdown when we are done shutting down + private Thread hdfsShutdownThread; + /* * Inner class that runs on a long period checking if regions need major * compaction. @@ -745,6 +767,43 @@ } /** + * So, HDFS caches FileSystems so when you call FileSystem.get it's fast. In + * order to make sure things are cleaned up, it also creates a shutdown hook + * so that all filesystems can be closed when the process is terminated. This + * conveniently runs concurrently with our own shutdown handler, and + * therefore causes all the filesystems to be closed before the server can do + * all its necessary cleanup. + * + * The crazy dirty reflection in this method sneaks into the FileSystem cache + * and grabs the shutdown hook, removes it from the list of active shutdown + * hooks, and hangs onto it until later. Then, after we're properly done with + * our graceful shutdown, we can execute the hdfs hook manually to make sure + * loose ends are tied up. + * + * This seems quite fragile and susceptible to breaking if Hadoop changes + * anything about the way this cleanup is managed. Keep an eye on things. + */ + private Thread suppressHdfsShutdownHook() { + try { + Field field = FileSystem.class.getDeclaredField ("clientFinalizer"); + field.setAccessible(true); + Thread hdfsClientFinalizer = (Thread)field.get(null); + if (hdfsClientFinalizer == null) { + throw new RuntimeException("client finalizer is null, can't suppress!"); + } + Runtime.getRuntime().removeShutdownHook(hdfsClientFinalizer); + return hdfsClientFinalizer; + + } catch (NoSuchFieldException nsfe) { + LOG.fatal("Couldn't find field 'clientFinalizer' in FileSystem!", nsfe); + throw new RuntimeException("Failed to suppress HDFS shutdown hook"); + } catch (IllegalAccessException iae) { + LOG.fatal("Couldn't access field 'clientFinalizer' in FileSystem!", iae); + throw new RuntimeException("Failed to suppress HDFS shutdown hook"); + } + } + + /** * Report the status of the server. A server is online once all the startup * is completed (setting up filesystem, starting service threads, etc.). This * method is designed mostly to be useful in tests.