Return-Path: X-Original-To: apmail-hbase-commits-archive@www.apache.org Delivered-To: apmail-hbase-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id B9CE017E5B for ; Wed, 20 May 2015 15:25:14 +0000 (UTC) Received: (qmail 2062 invoked by uid 500); 20 May 2015 15:25:14 -0000 Delivered-To: apmail-hbase-commits-archive@hbase.apache.org Received: (qmail 1970 invoked by uid 500); 20 May 2015 15:25:14 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 1785 invoked by uid 99); 20 May 2015 15:25:14 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 20 May 2015 15:25:14 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 5CDCCDFCF1; Wed, 20 May 2015 15:25:14 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jmhsieh@apache.org To: commits@hbase.apache.org Date: Wed, 20 May 2015 15:25:16 -0000 Message-Id: <146b8baf13f04c3090554d9abf3136da@git.apache.org> In-Reply-To: <0fe4822c7b4844099ae46ed01fe78d01@git.apache.org> References: <0fe4822c7b4844099ae46ed01fe78d01@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [03/50] [abbrv] hbase git commit: HBASE-13576 HBCK enhancement: Failure in checking one region should not fail the entire HBCK operation. (Stephen Yuan Jiang) HBASE-13576 HBCK enhancement: Failure in checking one region should not fail the entire HBCK operation. 
(Stephen Yuan Jiang) Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/11b76732 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/11b76732 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/11b76732 Branch: refs/heads/hbase-11339 Commit: 11b76732c0ec80a2cccbe7937440bd107e577c8b Parents: 67c6352 Author: Enis Soztutar Authored: Wed May 6 12:08:36 2015 -0700 Committer: Enis Soztutar Committed: Wed May 6 12:08:36 2015 -0700 ---------------------------------------------------------------------- .../org/apache/hadoop/hbase/util/HBaseFsck.java | 64 +++++++++++++++++--- .../hadoop/hbase/util/HBaseFsckRepair.java | 6 +- .../apache/hadoop/hbase/util/TestHBaseFsck.java | 1 - 3 files changed, 59 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/11b76732/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 46fad96..cf4d002 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -211,7 +211,7 @@ public class HBaseFsck extends Configured implements Closeable { private Table meta; // threads to do ||izable tasks: retrieve data from regionservers, handle overlapping regions protected ExecutorService executor; - private long startMillis = System.currentTimeMillis(); + private long startMillis = EnvironmentEdgeManager.currentTime(); private HFileCorruptionChecker hfcc; private int retcode = 0; private Path HBCK_LOCK_PATH; @@ -294,6 +294,7 @@ public class HBaseFsck extends Configured implements Closeable { new HashMap(); private final 
RetryCounterFactory lockFileRetryCounterFactory; + private Map> skippedRegions = new HashMap>(); /** * Constructor @@ -556,6 +557,7 @@ public class HBaseFsck extends Configured implements Closeable { errors.clear(); tablesInfo.clear(); orphanHdfsDirs.clear(); + skippedRegions.clear(); } /** @@ -1717,7 +1719,7 @@ public class HBaseFsck extends Configured implements Closeable { return false; } ServerName sn = metaLocation.getServerName(); - MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis()); + MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, EnvironmentEdgeManager.currentTime()); HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName()); if (hbckInfo == null) { regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m)); @@ -1817,6 +1819,17 @@ public class HBaseFsck extends Configured implements Closeable { checkRegionConsistencyConcurrently(replicaWorkItems); setCheckHdfs(prevHdfsCheck); + // If some regions are skipped during the checkRegionConsistencyConcurrently() phase, we might + // not get an accurate state of hbase if continuing. The config here allows users to tune + // the tolerance for the number of skipped regions. + // TODO: evaluate the consequence of continuing the hbck operation without config. + int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0); + int numOfSkippedRegions = skippedRegions.size(); + if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) { + throw new IOException(numOfSkippedRegions + + " region(s) could not be checked or repaired. 
See logs for detail."); + } + if (shouldCheckHdfs()) { checkAndFixTableStates(); } @@ -1862,11 +1875,32 @@ public class HBaseFsck extends Configured implements Closeable { @Override public synchronized Void call() throws Exception { - checkRegionConsistency(key, hbi); + try { + checkRegionConsistency(key, hbi); + } catch (Exception e) { + // If the region is non-META region, skip this region and send warning/error message; if + // the region is META region, we should not continue. + LOG.warn("Unable to complete check or repair the region '" + hbi.getRegionNameAsString() + + "'.", e); + if (hbi.getHdfsHRI().isMetaRegion()) { + throw e; + } + LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'"); + addSkippedRegion(hbi); + } return null; } } + private void addSkippedRegion(final HbckInfo hbi) { + Set skippedRegionNames = skippedRegions.get(hbi.getTableName()); + if (skippedRegionNames == null) { + skippedRegionNames = new HashSet(); + } + skippedRegionNames.add(hbi.getRegionNameAsString()); + skippedRegions.put(hbi.getTableName(), skippedRegionNames); + } + /** * Check and fix table states, assumes full info available: * - tableInfos @@ -2156,7 +2190,7 @@ public class HBaseFsck extends Configured implements Closeable { inMeta && hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline(); boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry.getTable()); boolean recentlyModified = inHdfs && - hbi.getModTime() + timelag > System.currentTimeMillis(); + hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTime(); // ========== First the healthy cases ============= if (hbi.containsOnlyHdfsEdits()) { @@ -3161,7 +3195,7 @@ public class HBaseFsck extends Configured implements Closeable { */ HTableDescriptor[] getTables(AtomicInteger numSkipped) { List tableNames = new ArrayList(); - long now = System.currentTimeMillis(); + long now = EnvironmentEdgeManager.currentTime(); for (HbckInfo hbi : regionInfoMap.values()) { MetaEntry info = hbi.metaEntry; @@ 
-3697,14 +3731,30 @@ public class HBaseFsck extends Configured implements Closeable { */ private void printTableSummary(SortedMap tablesInfo) { StringBuilder sb = new StringBuilder(); + int numOfSkippedRegions; errors.print("Summary:"); for (TableInfo tInfo : tablesInfo.values()) { + numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ? + skippedRegions.get(tInfo.getName()).size() : 0; + if (errors.tableHasErrors(tInfo)) { errors.print("Table " + tInfo.getName() + " is inconsistent."); - } else { - errors.print(" " + tInfo.getName() + " is okay."); + } else if (numOfSkippedRegions > 0){ + errors.print("Table " + tInfo.getName() + " is okay (with " + + numOfSkippedRegions + " skipped regions)."); + } + else { + errors.print("Table " + tInfo.getName() + " is okay."); } errors.print(" Number of regions: " + tInfo.getNumRegions()); + if (numOfSkippedRegions > 0) { + Set skippedRegionStrings = skippedRegions.get(tInfo.getName()); + System.out.println(" Number of skipped regions: " + numOfSkippedRegions); + System.out.println(" List of skipped regions:"); + for(String sr : skippedRegionStrings) { + System.out.println(" " + sr); + } + } sb.setLength(0); // clear out existing buffer, if any. 
sb.append(" Deployed on: "); for (ServerName server : tInfo.deployedOn) { http://git-wip-us.apache.org/repos/asf/hbase/blob/11b76732/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java index 4c742e3..7de7af8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java @@ -25,7 +25,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MetaTableAccessor; -import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.ZooKeeperConnectionException; @@ -35,7 +34,6 @@ import org.apache.hadoop.hbase.client.ClusterConnection; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.HConnection; -import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.master.RegionState; @@ -123,8 +121,8 @@ public class HBaseFsckRepair { public static void waitUntilAssigned(Admin admin, HRegionInfo region) throws IOException, InterruptedException { long timeout = admin.getConfiguration().getLong("hbase.hbck.assign.timeout", 120000); - long expiration = timeout + System.currentTimeMillis(); - while (System.currentTimeMillis() < expiration) { + long expiration = timeout + EnvironmentEdgeManager.currentTime(); + while (EnvironmentEdgeManager.currentTime() < expiration) { try { Map rits= 
admin.getClusterStatus().getRegionsInTransition(); http://git-wip-us.apache.org/repos/asf/hbase/blob/11b76732/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index fa16068..28b80ff 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -103,7 +103,6 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.SplitTransactionFactory; import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl; import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction; -import org.apache.hadoop.hbase.security.access.AccessControlClient; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.MiscTests; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;