hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From zg...@apache.org
Subject [hbase] branch branch-2.0 updated: HBASE-21402 Backport parent "HBASE-21325 Force to terminate regionserver when abort hang in somewhere"
Date Fri, 01 Feb 2019 01:22:25 GMT
This is an automated email from the ASF dual-hosted git repository.

zghao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new ad96aeb  HBASE-21402 Backport parent "HBASE-21325 Force to terminate regionserver when abort hang in somewhere"
ad96aeb is described below

commit ad96aeb92877781b3e5e21a55ff046e72aa4d895
Author: Pankaj <pankaj.kr@huawei.com>
AuthorDate: Thu Jan 17 00:18:59 2019 +0530

    HBASE-21402 Backport parent "HBASE-21325 Force to terminate regionserver when abort hang in somewhere"
    
    Signed-off-by: Guanghao Zhang <zghao@apache.org>
---
 .../hadoop/hbase/regionserver/HRegionServer.java   |  34 +++++
 .../regionserver/TestRegionServerAbortTimeout.java | 158 +++++++++++++++++++++
 2 files changed, 192 insertions(+)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 63bb517..6a945e5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -38,6 +38,8 @@ import java.util.Map.Entry;
 import java.util.Objects;
 import java.util.Set;
 import java.util.SortedMap;
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -306,6 +308,11 @@ public class HRegionServer extends HasThread implements
   // Go down hard. Used if file system becomes unavailable and also in
   // debugging and unit tests.
   private volatile boolean abortRequested;
+  public static final String ABORT_TIMEOUT = "hbase.regionserver.abort.timeout";
+  // Default abort timeout is 1200 seconds for safe
+  private static final long DEFAULT_ABORT_TIMEOUT = 1200000;
+  // Will run this task when abort timeout
+  public static final String ABORT_TIMEOUT_TASK = "hbase.regionserver.abort.timeout.task";
 
   ConcurrentMap<String, Integer> rowlocks = new ConcurrentHashMap<>();
 
@@ -1004,6 +1011,22 @@ public class HRegionServer extends HasThread implements
         abort(prefix + t.getMessage(), t);
       }
     }
+
+    if (abortRequested) {
+      Timer abortMonitor = new Timer("Abort regionserver monitor", true);
+      TimerTask abortTimeoutTask = null;
+      try {
+        abortTimeoutTask =
+            Class.forName(conf.get(ABORT_TIMEOUT_TASK, SystemExitWhenAbortTimeout.class.getName()))
+                .asSubclass(TimerTask.class).getDeclaredConstructor().newInstance();
+      } catch (Exception e) {
+        LOG.warn("Initialize abort timeout task failed", e);
+      }
+      if (abortTimeoutTask != null) {
+        abortMonitor.schedule(abortTimeoutTask, conf.getLong(ABORT_TIMEOUT, DEFAULT_ABORT_TIMEOUT));
+      }
+    }
+
     if (this.leases != null) {
       this.leases.closeAfterLeasesExpire();
     }
@@ -3661,4 +3684,15 @@ public class HRegionServer extends HasThread implements
     return ConnectionUtils.createShortCircuitConnection(conf, null, user, this.serverName,
         this.rpcServices, this.rpcServices);
   }
+
+  /**
+   * Force to terminate region server when abort timeout.
+   */
+  private static class SystemExitWhenAbortTimeout extends TimerTask {
+    @Override
+    public void run() {
+      LOG.warn("Aborting region server timed out, terminate forcibly...");
+      System.exit(1);
+    }
+  }
 }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
new file mode 100644
index 0000000..5013594
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.Optional;
+import java.util.TimerTask;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
+import org.apache.hadoop.hbase.coprocessor.RegionObserver;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.testclassification.RegionServerTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Threads;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ RegionServerTests.class, MediumTests.class })
+public class TestRegionServerAbortTimeout {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestRegionServerAbortTimeout.class);
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestRegionServerAbortTimeout.class);
+
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  private static TableName TABLE_NAME = TableName.valueOf("RSAbort");
+
+  private static byte[] CF = Bytes.toBytes("cf");
+
+  private static byte[] CQ = Bytes.toBytes("cq");
+
+  private static final int REGIONS_NUM = 5;
+
+  private static final int SLEEP_TIME_WHEN_CLOSE_REGION = 1000;
+
+  private static volatile boolean abortTimeoutTaskScheduled = false;
+
+  @BeforeClass
+  public static void setUp() throws Exception {
+    Configuration conf = UTIL.getConfiguration();
+    // Will schedule a abort timeout task after SLEEP_TIME_WHEN_CLOSE_REGION ms
+    conf.setLong(HRegionServer.ABORT_TIMEOUT, SLEEP_TIME_WHEN_CLOSE_REGION);
+    conf.set(HRegionServer.ABORT_TIMEOUT_TASK, TestAbortTimeoutTask.class.getName());
+    UTIL.startMiniCluster(2);
+    TableDescriptor td = TableDescriptorBuilder.newBuilder(TABLE_NAME)
+        .setCoprocessor(SleepWhenCloseCoprocessor.class.getName())
+        .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(CF).build()).build();
+    UTIL.getAdmin().createTable(td, Bytes.toBytes("0"), Bytes.toBytes("9"), REGIONS_NUM);
+  }
+
+  @AfterClass
+  public static void tearDown() throws Exception {
+    // Wait the SCP of abort rs to finish
+    UTIL.waitFor(30000, () -> UTIL.getMiniHBaseCluster().getMaster().getProcedures().stream()
+        .filter(p -> p instanceof ServerCrashProcedure && p.isFinished()).count() > 0);
+    UTIL.getAdmin().disableTable(TABLE_NAME);
+    UTIL.getAdmin().deleteTable(TABLE_NAME);
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testAbortTimeout() throws Exception {
+    Thread writer = new Thread(() -> {
+      try {
+        try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
+          for (int i = 0; i < 10000; i++) {
+            table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i)));
+          }
+        }
+      } catch (IOException e) {
+        LOG.warn("Failed to load data");
+      }
+    });
+    writer.setDaemon(true);
+    writer.start();
+
+    // Abort one region server
+    UTIL.getMiniHBaseCluster().getRegionServer(0).abort("Abort RS for test");
+
+    long startTime = System.currentTimeMillis();
+    long timeout = REGIONS_NUM * SLEEP_TIME_WHEN_CLOSE_REGION * 10;
+    while (System.currentTimeMillis() - startTime < timeout) {
+      if (UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 1) {
+        assertTrue("Abort timer task should be scheduled", abortTimeoutTaskScheduled);
+        return;
+      }
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+    fail("Failed to abort a region server in " + timeout + " ms");
+  }
+
+  static class TestAbortTimeoutTask extends TimerTask {
+
+    public TestAbortTimeoutTask() {
+    }
+
+    @Override
+    public void run() {
+      LOG.info("TestAbortTimeoutTask was scheduled");
+      abortTimeoutTaskScheduled = true;
+    }
+  }
+
+  public static class SleepWhenCloseCoprocessor implements RegionCoprocessor, RegionObserver {
+
+    public SleepWhenCloseCoprocessor() {
+    }
+
+    @Override
+    public Optional<RegionObserver> getRegionObserver() {
+      return Optional.of(this);
+    }
+
+    @Override
+    public void preClose(ObserverContext<RegionCoprocessorEnvironment> c, boolean abortRequested)
+        throws IOException {
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+  }
+}


Mime
View raw message