Return-Path: X-Original-To: apmail-hbase-commits-archive@www.apache.org Delivered-To: apmail-hbase-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 1ADFD117C7 for ; Wed, 17 Sep 2014 19:53:48 +0000 (UTC) Received: (qmail 38370 invoked by uid 500); 17 Sep 2014 19:53:47 -0000 Delivered-To: apmail-hbase-commits-archive@hbase.apache.org Received: (qmail 38325 invoked by uid 500); 17 Sep 2014 19:53:47 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 38316 invoked by uid 99); 17 Sep 2014 19:53:47 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 17 Sep 2014 19:53:47 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 991D3A193C4; Wed, 17 Sep 2014 19:53:47 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jmhsieh@apache.org To: commits@hbase.apache.org Message-Id: <5df30e8b21364a55b4dd97fe1b1f7ecd@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: git commit: HBASE-11405 Multiple invocations of hbck in parallel disables balancer permanently (Bharath Vissapragada) Date: Wed, 17 Sep 2014 19:53:47 +0000 (UTC) Repository: hbase Updated Branches: refs/heads/branch-1 88e7da321 -> 3b30a1042 HBASE-11405 Multiple invocations of hbck in parallel disables balancer permanently (Bharath Vissapragada) Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3b30a104 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3b30a104 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3b30a104 Branch: refs/heads/branch-1 Commit: 3b30a104247e65b6443bc94545235d789cf13185 Parents: 88e7da3 Author: Jonathan M Hsieh Authored: Wed Sep 17 12:51:53 2014 -0700 Committer: Jonathan M Hsieh Committed: Wed Sep 17 12:51:53 2014 -0700 ---------------------------------------------------------------------- .../org/apache/hadoop/hbase/util/HBaseFsck.java | 87 +++++++++++++++++++- .../apache/hadoop/hbase/util/TestHBaseFsck.java | 48 +++++++++++ 2 files changed, 134 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/3b30a104/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 017410a..6d3f2e1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InterruptedIOException; import java.io.PrintWriter; import java.io.StringWriter; +import java.net.InetAddress; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; @@ -45,18 +46,22 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.ClusterStatus; @@ -109,7 +114,10 @@ import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; import org.apache.hadoop.hbase.zookeeper.ZKTableStateClientSideReader; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.hbase.security.AccessDeniedException; +import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.ipc.RemoteException; +import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Tool; @@ -178,6 +186,8 @@ public class HBaseFsck extends Configured { private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2; private static final int DEFAULT_MAX_MERGE = 5; private static final String TO_BE_LOADED = "to_be_loaded"; + private static final String HBCK_LOCK_FILE = "hbase-hbck.lock"; + /********************** * Internal resources @@ -192,6 +202,11 @@ public class HBaseFsck extends Configured { private long startMillis = System.currentTimeMillis(); private HFileCorruptionChecker hfcc; private int retcode = 0; + private static Path HBCK_LOCK_PATH; + private FSDataOutputStream hbckOutFd; + // This lock is to prevent cleanup of balancer resources twice between + // ShutdownHook and the main code. + private static AtomicBoolean hbckLockCleanup = new AtomicBoolean(false); /*********** * Options @@ -301,10 +316,78 @@ public class HBaseFsck extends Configured { } /** + * This method maintains a lock using a file. If the creation fails we return null + * + * @return FSDataOutputStream object corresponding to the newly opened lock file + * @throws IOException + */ + private FSDataOutputStream checkAndMarkRunningHbck() throws IOException { + try { + FileSystem fs = FSUtils.getCurrentFileSystem(getConf()); + FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(), + HConstants.DATA_FILE_UMASK_KEY); + Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY); + fs.mkdirs(tmpDir); + HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE); + final FSDataOutputStream out = FSUtils.create(fs, HBCK_LOCK_PATH, defaultPerms, false); + out.writeBytes(InetAddress.getLocalHost().toString()); + out.flush(); + return out; + } catch(RemoteException e) { + if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){ + return null; + } else { + throw e; + } + } + } + + private void unlockHbck() throws IOException { + if(hbckLockCleanup.compareAndSet(false, true)){ + IOUtils.closeStream(hbckOutFd); + try{ + FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true); + //Reset the hbckLockCleanup to false so that subsequent calls using the same + // Hbck object succeed. This is added for tests, which keep re-using the same + // objects + hbckLockCleanup.set(false); + } catch(IOException ioe) { + LOG.warn("Failed to delete " + HBCK_LOCK_PATH); + LOG.debug(ioe); + } + } + } + + /** * To repair region consistency, one must call connect() in order to repair * online state. */ public void connect() throws IOException { + + // Check if another instance of balancer is running + hbckOutFd = checkAndMarkRunningHbck(); + if (hbckOutFd == null) { + setRetCode(-1); + LOG.error("Another instance of hbck is running, exiting this instance.[If you are sure" + + " no other instance is running, delete the lock file " + + HBCK_LOCK_PATH + " and rerun the tool]"); + throw new IOException("Duplicate hbck - Abort"); + } + + // Add a shutdown hook to this thread, incase user tries to + // kill the hbck with a ctrl-c, we want to cleanup the lock so that + // it is available for further calls + Runtime.getRuntime().addShutdownHook(new Thread() { + public void run() { + try{ + unlockHbck(); + } catch(Exception e){ + LOG.debug("Error while removing hbck lock " + e.getMessage()); + } + } + }); + LOG.debug("Launching hbck"); + connection = HConnectionManager.createConnection(getConf()); admin = new HBaseAdmin(connection); meta = new HTable(TableName.META_TABLE_NAME, connection); @@ -499,6 +582,9 @@ public class HBaseFsck extends Configured { checkAndFixTableLocks(); + // Remove the hbck lock + unlockHbck(); + // Print table summary printTableSummary(tablesInfo); return errors.summarize(); @@ -3842,7 +3928,6 @@ public class HBaseFsck extends Configured { Path hbasedir = FSUtils.getRootDir(conf); URI defaultFs = hbasedir.getFileSystem(conf).getUri(); FSUtils.setFsDefault(conf, new Path(defaultFs)); - int ret = ToolRunner.run(new HBaseFsckTool(conf), args); System.exit(ret); } http://git-wip-us.apache.org/repos/asf/hbase/blob/3b30a104/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index b464de0..fe068c9 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -36,8 +36,13 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.NavigableMap; +import java.util.Set; +import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -527,6 +532,49 @@ public class TestHBaseFsck { } /** + * This test makes sure that parallel instances of Hbck is disabled. + * + * @throws Exception + */ + @Test + public void testParallelHbck() throws Exception { + final ExecutorService service; + final Future hbck1,hbck2; + + class RunHbck implements Callable{ + boolean fail = true; + public HBaseFsck call(){ + try{ + return doFsck(conf, false); + } catch(Exception e){ + if (e.getMessage().contains("Duplicate hbck")) { + fail = false; + } + } + // If we reach here, then an exception was caught + if (fail) fail(); + return null; + } + } + service = Executors.newFixedThreadPool(2); + hbck1 = service.submit(new RunHbck()); + hbck2 = service.submit(new RunHbck()); + service.shutdown(); + //wait till hbck calls finish + service.awaitTermination(Integer.MAX_VALUE, TimeUnit.SECONDS); + HBaseFsck h1 = hbck1.get(); + HBaseFsck h2 = hbck2.get(); + // Make sure only one of the calls was successful + assert(h1 == null || h2 == null); + if (h1 != null) { + assert(h1.getRetCode() >= 0); + } + if (h2 != null) { + assert(h2.getRetCode() >= 0); + } + } + + /** * This create and fixes a bad table with regions that have a duplicate * start key */