Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 0015D200CB0 for ; Fri, 9 Jun 2017 02:34:47 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id F29B9160BE5; Fri, 9 Jun 2017 00:34:47 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id ED377160BD5 for ; Fri, 9 Jun 2017 02:34:46 +0200 (CEST) Received: (qmail 35269 invoked by uid 500); 9 Jun 2017 00:34:46 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 35260 invoked by uid 99); 9 Jun 2017 00:34:46 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 09 Jun 2017 00:34:45 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 991DDDFAF1; Fri, 9 Jun 2017 00:34:43 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: garyh@apache.org To: commits@hbase.apache.org Message-Id: <857d0b6d4bc94e87b5f1d0d37fdd7b10@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hbase git commit: HBASE-18141 Regionserver fails to shutdown when abort triggered during RPC call Date: Fri, 9 Jun 2017 00:34:43 +0000 (UTC) archived-at: Fri, 09 Jun 2017 00:34:48 -0000 Repository: hbase Updated Branches: refs/heads/branch-2 ea7d51e12 -> 17966525e HBASE-18141 Regionserver fails to shutdown when abort triggered during RPC call Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/17966525 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/17966525 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/17966525 Branch: refs/heads/branch-2 Commit: 17966525e94524f7fc4a72aeeb6804b05020c97c Parents: ea7d51e Author: Gary Helmling Authored: Thu Jun 1 15:47:04 2017 -0700 Committer: Gary Helmling Committed: Thu Jun 8 17:30:20 2017 -0700 ---------------------------------------------------------------------- .../hbase/regionserver/HRegionServer.java | 40 +++- .../RegionServerCoprocessorHost.java | 4 +- .../regionserver/TestRegionServerAbort.java | 213 +++++++++++++++++++ 3 files changed, 245 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/17966525/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index ef99e47..4a75bb1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -27,6 +27,7 @@ import java.lang.reflect.Constructor; import java.net.BindException; import java.net.InetAddress; import java.net.InetSocketAddress; +import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -106,9 +107,11 @@ import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.io.util.MemorySizeUtil; import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils; +import org.apache.hadoop.hbase.ipc.RpcCallContext; import org.apache.hadoop.hbase.ipc.RpcClient; import org.apache.hadoop.hbase.ipc.RpcClientFactory; import org.apache.hadoop.hbase.ipc.RpcControllerFactory; +import org.apache.hadoop.hbase.ipc.RpcServer; import org.apache.hadoop.hbase.ipc.RpcServerInterface; import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; import org.apache.hadoop.hbase.ipc.ServerRpcController; @@ -133,6 +136,7 @@ import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener; import org.apache.hadoop.hbase.replication.regionserver.Replication; import org.apache.hadoop.hbase.replication.regionserver.ReplicationLoad; import org.apache.hadoop.hbase.security.Superusers; +import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.UserProvider; import org.apache.hadoop.hbase.shaded.com.google.protobuf.*; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; @@ -2060,19 +2064,34 @@ public class HRegionServer extends HasThread implements @Override public void stop(final String msg) { + stop(msg, false, RpcServer.getRequestUser()); + } + + /** + * Stops the regionserver. + * @param msg Status message + * @param force True if this is a regionserver abort + * @param user The user executing the stop request, or null if no user is associated + */ + public void stop(final String msg, final boolean force, final User user) { if (!this.stopped) { LOG.info("***** STOPPING region server '" + this + "' *****"); - try { - if (this.rsHost != null) { - this.rsHost.preStop(msg); + if (this.rsHost != null) { + // when forced via abort don't allow CPs to override + try { + this.rsHost.preStop(msg, user); + } catch (IOException ioe) { + if (!force) { + LOG.warn("The region server did not stop", ioe); + return; + } + LOG.warn("Skipping coprocessor exception on preStop() due to forced shutdown", ioe); } - this.stopped = true; - LOG.info("STOPPED: " + msg); - // Wakes run() if it is sleeping - sleeper.skipSleepCycle(); - } catch (IOException exp) { - LOG.warn("The region server did not stop", exp); } + this.stopped = true; + LOG.info("STOPPED: " + msg); + // Wakes run() if it is sleeping + sleeper.skipSleepCycle(); } } @@ -2321,7 +2340,8 @@ public class HRegionServer extends HasThread implements } catch (Throwable t) { LOG.warn("Unable to report fatal error to master", t); } - stop(reason); + // shutdown should be run as the internal user + stop(reason, true, null); } /** http://git-wip-us.apache.org/repos/asf/hbase/blob/17966525/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerCoprocessorHost.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerCoprocessorHost.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerCoprocessorHost.java index 9d68d1b..9bcf201 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerCoprocessorHost.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerCoprocessorHost.java @@ -80,10 +80,10 @@ public class RegionServerCoprocessorHost extends sequence, conf, this.rsServices); } - public void preStop(String message) throws IOException { + public void preStop(String message, User user) throws IOException { // While stopping the region server all coprocessors method should be executed first then the // coprocessor should be cleaned up. - execShutdown(coprocessors.isEmpty() ? null : new CoprocessorOperation() { + execShutdown(coprocessors.isEmpty() ? null : new CoprocessorOperation(user) { @Override public void call(RegionServerObserver oserver, ObserverContext ctx) throws IOException { http://git-wip-us.apache.org/repos/asf/hbase/blob/17966525/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java new file mode 100644 index 0000000..784c079 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbort.java @@ -0,0 +1,213 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.regionserver; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.Durability; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import org.apache.hadoop.hbase.coprocessor.RegionObserver; +import org.apache.hadoop.hbase.coprocessor.RegionServerCoprocessorEnvironment; +import org.apache.hadoop.hbase.coprocessor.RegionServerObserver; +import org.apache.hadoop.hbase.regionserver.wal.WALEdit; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.RegionServerTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.apache.hadoop.hbase.wal.WAL; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.io.IOException; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Tests around regionserver shutdown and abort + */ +@Category({RegionServerTests.class, MediumTests.class}) +public class TestRegionServerAbort { + private static final byte[] FAMILY_BYTES = Bytes.toBytes("f"); + + private static final Log LOG = LogFactory.getLog(TestRegionServerAbort.class); + + private HBaseTestingUtility testUtil; + private Configuration conf; + private MiniDFSCluster dfsCluster; + private MiniHBaseCluster cluster; + + @Before + public void setup() throws Exception { + testUtil = new HBaseTestingUtility(); + conf = testUtil.getConfiguration(); + conf.set(CoprocessorHost.REGIONSERVER_COPROCESSOR_CONF_KEY, + StopBlockingRegionObserver.class.getName()); + conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, + StopBlockingRegionObserver.class.getName()); + // make sure we have multiple blocks so that the client does not prefetch all block locations + conf.set("dfs.blocksize", Long.toString(100 * 1024)); + // prefetch the first block + conf.set(DFSConfigKeys.DFS_CLIENT_READ_PREFETCH_SIZE_KEY, Long.toString(100 * 1024)); + conf.set(HConstants.REGION_IMPL, ErrorThrowingHRegion.class.getName()); + + testUtil.startMiniZKCluster(); + dfsCluster = testUtil.startMiniDFSCluster(2); + cluster = testUtil.startMiniHBaseCluster(1, 2); + } + + @After + public void tearDown() throws Exception { + for (JVMClusterUtil.RegionServerThread t : cluster.getRegionServerThreads()) { + HRegionServer rs = t.getRegionServer(); + RegionServerCoprocessorHost cpHost = rs.getRegionServerCoprocessorHost(); + StopBlockingRegionObserver cp = (StopBlockingRegionObserver) + cpHost.findCoprocessor(StopBlockingRegionObserver.class.getName()); + cp.setStopAllowed(true); + } + ((StopBlockingRegionObserver) cluster.getMaster().getRegionServerCoprocessorHost().findCoprocessor( + StopBlockingRegionObserver.class.getName() + )).setStopAllowed(true); + testUtil.shutdownMiniCluster(); + } + + /** + * Test that a regionserver is able to abort properly, even when a coprocessor + * throws an exception in preStopRegionServer(). + */ + @Test + public void testAbortFromRPC() throws Exception { + TableName tableName = TableName.valueOf("testAbortFromRPC"); + // create a test table + Table table = testUtil.createTable(tableName, FAMILY_BYTES); + + // write some edits + testUtil.loadTable(table, FAMILY_BYTES); + LOG.info("Wrote data"); + // force a flush + cluster.flushcache(tableName); + LOG.info("Flushed table"); + + // Send a poisoned put to trigger the abort + Put put = new Put(new byte[]{0, 0, 0, 0}); + put.addColumn(FAMILY_BYTES, Bytes.toBytes("c"), new byte[]{}); + put.setAttribute(StopBlockingRegionObserver.DO_ABORT, new byte[]{1}); + + table.put(put); + // should have triggered an abort due to FileNotFoundException + + // verify that the regionserver is stopped + HRegion firstRegion = cluster.findRegionsForTable(tableName).get(0); + assertNotNull(firstRegion); + assertNotNull(firstRegion.getRegionServerServices()); + LOG.info("isAborted = " + firstRegion.getRegionServerServices().isAborted()); + assertTrue(firstRegion.getRegionServerServices().isAborted()); + LOG.info("isStopped = " + firstRegion.getRegionServerServices().isStopped()); + assertTrue(firstRegion.getRegionServerServices().isStopped()); + } + + /** + * Test that a coprocessor is able to override a normal regionserver stop request. + */ + @Test + public void testStopOverrideFromCoprocessor() throws Exception { + Admin admin = testUtil.getHBaseAdmin(); + HRegionServer regionserver = cluster.getRegionServer(0); + admin.stopRegionServer(regionserver.getServerName().getHostAndPort()); + + // regionserver should have failed to stop due to coprocessor + assertFalse(cluster.getRegionServer(0).isAborted()); + assertFalse(cluster.getRegionServer(0).isStopped()); + } + + public static class StopBlockingRegionObserver implements RegionServerObserver, RegionObserver { + public static final String DO_ABORT = "DO_ABORT"; + private boolean stopAllowed; + + @Override + public void prePut(ObserverContext c, Put put, WALEdit edit, + Durability durability) throws IOException { + if (put.getAttribute(DO_ABORT) != null) { + HRegionServer rs = (HRegionServer) c.getEnvironment().getRegionServerServices(); + LOG.info("Triggering abort for regionserver " + rs.getServerName()); + rs.abort("Aborting for test"); + } + } + + @Override + public void preStopRegionServer(ObserverContext env) + throws IOException { + if (!stopAllowed) { + throw new IOException("Stop not allowed"); + } + } + + public void setStopAllowed(boolean allowed) { + this.stopAllowed = allowed; + } + + public boolean isStopAllowed() { + return stopAllowed; + } + } + + /** + * Throws an exception during store file refresh in order to trigger a regionserver abort. + */ + public static class ErrorThrowingHRegion extends HRegion { + public ErrorThrowingHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, + HRegionInfo regionInfo, HTableDescriptor htd, + RegionServerServices rsServices) { + super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); + } + + public ErrorThrowingHRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, + HTableDescriptor htd, RegionServerServices rsServices) { + super(fs, wal, confParam, htd, rsServices); + } + + @Override + protected boolean refreshStoreFiles(boolean force) throws IOException { + // forced when called through RegionScannerImpl.handleFileNotFound() + if (force) { + throw new IOException("Failing file refresh for testing"); + } + return super.refreshStoreFiles(force); + } + } +}