Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 8D1ED200CB0 for ; Fri, 9 Jun 2017 03:39:54 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 8BB60160BE5; Fri, 9 Jun 2017 01:39:54 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 86C97160BD5 for ; Fri, 9 Jun 2017 03:39:53 +0200 (CEST) Received: (qmail 62382 invoked by uid 500); 9 Jun 2017 01:39:52 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 62373 invoked by uid 99); 9 Jun 2017 01:39:52 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 09 Jun 2017 01:39:52 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 9412FDFAF5; Fri, 9 Jun 2017 01:39:52 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: garyh@apache.org To: commits@hbase.apache.org Message-Id: <5018a737ead642bdb3dc5c9ddbc9c666@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hbase git commit: HBASE-18141 Regionserver fails to shutdown when abort triggered during RPC call Date: Fri, 9 Jun 2017 01:39:52 +0000 (UTC) archived-at: Fri, 09 Jun 2017 01:39:54 -0000 Repository: hbase Updated Branches: refs/heads/branch-1.3 422775733 -> 92f3db6cb HBASE-18141 Regionserver fails to shutdown when abort triggered during RPC call Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/92f3db6c Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/92f3db6c Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/92f3db6c Branch: refs/heads/branch-1.3 Commit: 92f3db6cb99da542059560c04746c192930e8646 Parents: 4227757 Author: Gary Helmling Authored: Thu Jun 1 15:47:04 2017 -0700 Committer: Gary Helmling Committed: Thu Jun 8 18:37:42 2017 -0700 ---------------------------------------------------------------------- .../hbase/regionserver/HRegionServer.java | 50 ++++- .../regionserver/TestRegionServerAbort.java | 210 +++++++++++++++++++ 2 files changed, 250 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/92f3db6c/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index dd8bb5b..3329fd6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -27,6 +27,7 @@ import java.lang.reflect.Constructor; import java.net.BindException; import java.net.InetAddress; import java.net.InetSocketAddress; +import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -150,6 +151,7 @@ import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL; import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener; import org.apache.hadoop.hbase.replication.regionserver.ReplicationLoad; import org.apache.hadoop.hbase.security.Superusers; +import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.UserProvider; import org.apache.hadoop.hbase.trace.SpanReceiverHost; import org.apache.hadoop.hbase.util.Addressing; @@ -1914,18 +1916,32 @@ public class HRegionServer extends HasThread implements @Override public void stop(final String msg) { + stop(msg, false); + } + + /** + * Stops the regionserver. + * @param msg Status message + * @param force True if this is a regionserver abort + */ + public void stop(final String msg, final boolean force) { if (!this.stopped) { - try { - if (this.rsHost != null) { + if (this.rsHost != null) { + // when forced via abort don't allow CPs to override + try { this.rsHost.preStop(msg); + } catch (IOException ioe) { + if (!force) { + LOG.warn("The region server did not stop", ioe); + return; + } + LOG.warn("Skipping coprocessor exception on preStop() due to forced shutdown", ioe); } - this.stopped = true; - LOG.info("STOPPED: " + msg); - // Wakes run() if it is sleeping - sleeper.skipSleepCycle(); - } catch (IOException exp) { - LOG.warn("The region server did not stop", exp); } + this.stopped = true; + LOG.info("STOPPED: " + msg); + // Wakes run() if it is sleeping + sleeper.skipSleepCycle(); } } @@ -2092,7 +2108,7 @@ public class HRegionServer extends HasThread implements * the exception that caused the abort, or null */ @Override - public void abort(String reason, Throwable cause) { + public void abort(final String reason, Throwable cause) { String msg = "ABORTING region server " + this + ": " + reason; if (cause != null) { LOG.fatal(msg, cause); @@ -2130,7 +2146,21 @@ public class HRegionServer extends HasThread implements } catch (Throwable t) { LOG.warn("Unable to report fatal error to master", t); } - stop(reason); + // shutdown should be run as the internal user + if (User.isHBaseSecurityEnabled(conf)) { + try { + User.runAsLoginUser(new PrivilegedExceptionAction() { + @Override + public Object run() throws Exception { + stop(reason, true); + return null; + } + }); + } catch (IOException neverThrown) { + } + } else { + stop(reason, true); + } } /** http://git-wip-us.apache.org/repos/asf/hbase/blob/92f3db6c/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java new file mode 100644 index 0000000..3d66c5f --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java @@ -0,0 +1,210 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.regionserver; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.coprocessor.BaseRegionServerObserver; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionServerCoprocessorEnvironment; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.RegionServerTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.apache.hadoop.hbase.wal.WAL; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.io.IOException; +import java.util.Collection; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Tests around regionserver shutdown and abort + */ +@Category({RegionServerTests.class, MediumTests.class}) +public class TestRegionServerAbort { + private static final byte[] FAMILY_BYTES = Bytes.toBytes("f"); + + private static final Log LOG = LogFactory.getLog(TestRegionServerAbort.class); + + private HBaseTestingUtility testUtil; + private Configuration conf; + private MiniDFSCluster dfsCluster; + private MiniHBaseCluster cluster; + + @Before + public void setup() throws Exception { + testUtil = new HBaseTestingUtility(); + conf = testUtil.getConfiguration(); + conf.set(CoprocessorHost.REGIONSERVER_COPROCESSOR_CONF_KEY, + StopBlockingRegionObserver.class.getName()); + // make sure we have multiple blocks so that the client does not prefetch all block locations + conf.set("dfs.blocksize", Long.toString(100 * 1024)); + // prefetch the first block + conf.set(DFSConfigKeys.DFS_CLIENT_READ_PREFETCH_SIZE_KEY, Long.toString(100 * 1024)); + conf.set(HConstants.REGION_IMPL, ErrorThrowingHRegion.class.getName()); + + testUtil.startMiniZKCluster(); + dfsCluster = testUtil.startMiniDFSCluster(2); + cluster = testUtil.startMiniHBaseCluster(1, 2); + } + + @After + public void tearDown() throws Exception { + for (JVMClusterUtil.RegionServerThread t : cluster.getRegionServerThreads()) { + HRegionServer rs = t.getRegionServer(); + RegionServerCoprocessorHost cpHost = rs.getRegionServerCoprocessorHost(); + StopBlockingRegionObserver cp = (StopBlockingRegionObserver) + cpHost.findCoprocessor(StopBlockingRegionObserver.class.getName()); + cp.setStopAllowed(true); + } + testUtil.shutdownMiniCluster(); + } + + /** + * Test that a regionserver is able to abort properly, even when a coprocessor + * throws an exception in preStopRegionServer(). + */ + @Test + public void testAbortFromRPC() throws Exception { + TableName tableName = TableName.valueOf("testAbortFromRPC"); + // create a test table + HTable table = testUtil.createTable(tableName, FAMILY_BYTES); + + // write some edits + testUtil.loadTable(table, FAMILY_BYTES); + LOG.info("Wrote data"); + // force a flush + cluster.flushcache(tableName); + LOG.info("Flushed table"); + + // delete a store file from the table region + HRegion firstRegion = cluster.findRegionsForTable(tableName).get(0); + + // aborting from region + HRegionFileSystem regionFS = firstRegion.getRegionFileSystem(); + Collection storeFileInfos = regionFS.getStoreFiles(FAMILY_BYTES); + assertFalse(storeFileInfos.isEmpty()); + StoreFileInfo firstStoreFile = storeFileInfos.iterator().next(); + + // move the store file away + // we will still be able to read the first block, since the location was pre-fetched on open + // but attempts to read subsequent blocks will fail + LOG.info("Moving store file " + firstStoreFile.getPath()); + FileSystem fs = regionFS.getFileSystem(); + Path tmpdir = new Path("/tmp"); + fs.mkdirs(tmpdir); + assertTrue(fs.rename(firstStoreFile.getPath(), + new Path(tmpdir, firstStoreFile.getPath().getName()))); + + // start a scan, this should trigger a regionserver abort + ResultScanner scanner = table.getScanner(new Scan()); + int count = 0; + for (Result f : scanner) { + count++; + } + LOG.info("Finished scan with " + count + " results"); + // should have triggered an abort due to FileNotFoundException + + // verify that the regionserver is stopped + assertTrue(firstRegion.getRegionServerServices().isAborted()); + assertTrue(firstRegion.getRegionServerServices().isStopped()); + } + + /** + * Test that a coprocessor is able to override a normal regionserver stop request. + */ + @Test + public void testStopOverrideFromCoprocessor() throws Exception { + Admin admin = testUtil.getHBaseAdmin(); + HRegionServer regionserver = cluster.getRegionServer(0); + admin.stopRegionServer(regionserver.getServerName().getHostAndPort()); + + // regionserver should have failed to stop due to coprocessor + assertFalse(cluster.getRegionServer(0).isAborted()); + assertFalse(cluster.getRegionServer(0).isStopped()); + } + + public static class StopBlockingRegionObserver extends BaseRegionServerObserver { + private boolean stopAllowed; + + @Override + public void preStopRegionServer(ObserverContext env) + throws IOException { + if (!stopAllowed) { + throw new IOException("Stop not allowed"); + } + } + + public void setStopAllowed(boolean allowed) { + this.stopAllowed = allowed; + } + + public boolean isStopAllowed() { + return stopAllowed; + } + } + + /** + * Throws an exception during store file refresh in order to trigger a regionserver abort. + */ + public static class ErrorThrowingHRegion extends HRegion { + public ErrorThrowingHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, + HRegionInfo regionInfo, HTableDescriptor htd, + RegionServerServices rsServices) { + super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); + } + + public ErrorThrowingHRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, + HTableDescriptor htd, RegionServerServices rsServices) { + super(fs, wal, confParam, htd, rsServices); + } + + @Override + protected boolean refreshStoreFiles(boolean force) throws IOException { + // forced when called through RegionScannerImpl.handleFileNotFound() + if (force) { + throw new IOException("Failing file refresh for testing"); + } + return super.refreshStoreFiles(force); + } + } +}