hbase-commits mailing list archives

From apurt...@apache.org
Subject [hbase] branch branch-1 updated: HBASE-25130 - Fix master in-memory server holding map after: (#3402)
Date Mon, 28 Jun 2021 17:01:34 GMT
This is an automated email from the ASF dual-hosted git repository.

apurtell pushed a commit to branch branch-1
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-1 by this push:
     new 395eb0c  HBASE-25130 - Fix master in-memory server holding map after: (#3402)
395eb0c is described below

commit 395eb0c8e03382d6c558f5451785a25ea1caaa3e
Author: Victor <vli02@hotmail.com>
AuthorDate: Mon Jun 28 10:00:46 2021 -0700

    HBASE-25130 - Fix master in-memory server holding map after: (#3402)
    
    HBASE-25130 [branch-1] Master's in-memory serverHoldings map is not cleared during hbck repair
    
    Signed-off-by: Andrew Purtell <apurtell@apache.org>
---
 .../hadoop/hbase/master/AssignmentManager.java     | 28 +++++++--
 .../hadoop/hbase/master/MasterRpcServices.java     |  2 +-
 .../apache/hadoop/hbase/master/RegionStates.java   |  8 +--
 .../apache/hadoop/hbase/util/TestHBaseFsck.java    | 70 ++++++++++++++++++++++
 4 files changed, 99 insertions(+), 9 deletions(-)
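
At a glance, the patch threads a new "force" flag from the master RPC layer down into RegionStates. A condensed sketch of the resulting call chain, pieced together from the hunks below (illustrative pseudo-flow, not one compilable unit):

    // MasterRpcServices.offlineRegion: the admin RPC now forces the offline.
    master.assignmentManager.regionOffline(hri, true);

    // AssignmentManager: a new public overload delegates to the private
    // three-argument version, passing a null state (meaning plain Offline).
    public void regionOffline(final HRegionInfo regionInfo, boolean force) {
      regionOffline(regionInfo, null, force);
    }

    // RegionStates.regionOffline(hri, expectedState, force): when force is
    // set, the region is dropped from the serverHoldings map even though it
    // is neither merged/split nor part of a disabled/disabling table.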

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 2b8c521..f7cfc4c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -1661,6 +1661,19 @@ public class AssignmentManager extends ZooKeeperListener {
     regionOffline(regionInfo, null);
   }
 
+  /**
+   * Marks the region as offline, optionally also removing it from the
+   * replicas and from the master's in-memory server holding map.
+   * <p>
+   * @param regionInfo - region info.
+   * @param force - set to true to force this region to be removed from the replicas and the
+   *   master's in-memory server holding map, so that the region is not re-opened on any other
+   *   region server. The only use case is hbck for now.
+   */
+  public void regionOffline(final HRegionInfo regionInfo, boolean force) {
+    regionOffline(regionInfo, null, force);
+  }
+
   public void offlineDisabledRegion(HRegionInfo regionInfo) {
     if (useZKForAssignment) {
       // Disabling so should not be reassigned, just delete the CLOSED node
@@ -4551,13 +4564,20 @@ public class AssignmentManager extends ZooKeeperListener {
 
   public Map<String, AtomicInteger> getFailedOpenTracker() {return failedOpenTracker;}
 
+  private void regionOffline(final HRegionInfo regionInfo, final State state) {
+    regionOffline(regionInfo, state, false);
+  }
+
   /**
    * A region is offline.  The new state should be the specified one,
    * if not null.  If the specified state is null, the new state is Offline.
    * The specified state can be Split/Merged/Offline/null only.
+   *
+   * If the region offline is initiated by an rpc call from admin, we force it offline.
    */
-  private void regionOffline(final HRegionInfo regionInfo, final State state) {
-    regionStates.regionOffline(regionInfo, state);
+  private void regionOffline(final HRegionInfo regionInfo, final State state,
+      final boolean force) {
+    regionStates.regionOffline(regionInfo, state, force);
     removeClosedRegion(regionInfo);
     // remove the region plan as well just in case.
     clearRegionPlan(regionInfo);
@@ -4566,7 +4586,7 @@ public class AssignmentManager extends ZooKeeperListener {
     // Tell our listeners that a region was closed
     sendRegionClosedNotification(regionInfo);
     // also note that all the replicas of the primary should be closed
-    if (state != null && state.equals(State.SPLIT)) {
+    if (force || (state != null && state.equals(State.SPLIT))) {
       Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
       c.add(regionInfo);
       Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
@@ -4575,7 +4595,7 @@ public class AssignmentManager extends ZooKeeperListener {
         replicasToClose.addAll(list);
       }
     }
-    else if (state != null && state.equals(State.MERGED)) {
+    else if (force || (state != null && state.equals(State.MERGED))) {
       Collection<HRegionInfo> c = new ArrayList<HRegionInfo>(1);
       c.add(regionInfo);
       Map<ServerName, List<HRegionInfo>> map = regionStates.getRegionAssignments(c);
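
Note that the force path reuses the existing replica bookkeeping: the primary's assigned replicas are gathered via getRegionAssignments and queued on replicasToClose so they are not re-opened elsewhere. A caller-side sketch of the new public overload (hypothetical snippet; assumes an HMaster handle named master and an HRegionInfo hri in scope):

    AssignmentManager am = master.getAssignmentManager();
    // Force the region offline: besides the normal offline bookkeeping, this
    // removes it from the replicas and from the master's in-memory
    // serverHoldings map so it will not be re-opened on any region server.
    am.regionOffline(hri, true);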
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
index 0b82613..e87f664 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
@@ -1401,7 +1401,7 @@ public class MasterRpcServices extends RSRpcServices
         master.cpHost.preRegionOffline(hri);
       }
       LOG.info(master.getClientIdAuditPrefix() + " offline " + hri.getRegionNameAsString());
-      master.assignmentManager.regionOffline(hri);
+      master.assignmentManager.regionOffline(hri, true);
       if (master.cpHost != null) {
         master.cpHost.postRegionOffline(hri);
       }
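
This RPC is the path hbck exercises when repairing. From a client, the same endpoint is reachable through the Admin API; a minimal sketch, assuming a Configuration conf and an HRegionInfo hri are in scope (error handling elided):

    import org.apache.hadoop.hbase.client.Admin;
    import org.apache.hadoop.hbase.client.Connection;
    import org.apache.hadoop.hbase.client.ConnectionFactory;

    try (Connection conn = ConnectionFactory.createConnection(conf);
         Admin admin = conn.getAdmin()) {
      // Admin.offline sends the offlineRegion RPC handled above, which now
      // forces the region out of the master's in-memory state.
      admin.offline(hri.getRegionName());
    }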
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
index 0d7904b..4fc4c58 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
@@ -669,7 +669,7 @@ public class RegionStates {
    * A region is offline, won't be in transition any more.
    */
   public void regionOffline(final HRegionInfo hri) {
-    regionOffline(hri, null);
+    regionOffline(hri, null, false);
   }
 
   /**
@@ -678,7 +678,7 @@ public class RegionStates {
    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
    */
   public void regionOffline(
-      final HRegionInfo hri, final State expectedState) {
+      final HRegionInfo hri, final State expectedState, final boolean force) {
     Preconditions.checkArgument(expectedState == null
       || RegionState.isUnassignable(expectedState),
         "Offlined region should not be " + expectedState);
@@ -713,9 +713,9 @@ public class RegionStates {
       regionsInTransition.remove(encodedName);
       ServerName oldServerName = regionAssignments.remove(hri);
       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
-        if (newState == State.MERGED || newState == State.SPLIT
+        if (force || (newState == State.MERGED || newState == State.SPLIT
             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
-            TableState.State.DISABLED, TableState.State.DISABLING)) {
+            TableState.State.DISABLED, TableState.State.DISABLING))) {
           // Offline the region only if it's merged/split, or the table is disabled/disabling.
           // Otherwise, offline it from this server only when it is online on a different server.
           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
index 7b6a4b3..28d3556 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
@@ -882,6 +882,76 @@ public class TestHBaseFsck {
       assertNoErrors(hbck2);
       assertEquals(0, hbck2.getOverlapGroups(table).size());
       assertEquals(ROWKEYS.length, countRows());
+
+      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+      long totalRegions = cluster.countServedRegions();
+
+      // stop a region server and run fsck again
+      cluster.stopRegionServer(server);
+      cluster.waitForRegionServerToStop(server, 60);
+
+      // wait for all regions to come back online.
+      while (cluster.countServedRegions() < totalRegions) {
+        Thread.sleep(100);
+      }
+
+      // check again after stopping a region server.
+      HBaseFsck hbck3 = doFsck(conf, false);
+      assertNoErrors(hbck3);
+    } finally {
+      cleanupTable(table);
+    }
+  }
+
+  /**
+   * This creates and fixes a bad table with overlapping regions.
+   */
+  @Test(timeout=180000)
+  public void testOverlapRegions() throws Exception {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    TableName table =
+        TableName.valueOf("tableOverlapRegions");
+    HRegionInfo hri;
+    ServerName server;
+    try {
+      setupTable(table);
+      assertNoErrors(doFsck(conf, false));
+      assertEquals(ROWKEYS.length, countRows());
+
+      // Now let's mess it up by adding a region which overlaps with others
+      hri = createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
+      TEST_UTIL.assignRegion(hri);
+      server = regionStates.getRegionServerOfRegion(hri);
+      TEST_UTIL.assertRegionOnServer(hri, server, REGION_ONLINE_TIMEOUT);
+
+      HBaseFsck hbck = doFsck(conf, false);
+      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
+        ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
+      assertEquals(3, hbck.getOverlapGroups(table).size());
+      assertEquals(ROWKEYS.length, countRows());
+
+      // fix the overlap regions.
+      doFsck(conf, true);
+
+      // check that the overlap regions are gone and no data loss
+      HBaseFsck hbck2 = doFsck(conf, false);
+      assertNoErrors(hbck2);
+      assertEquals(0, hbck2.getOverlapGroups(table).size());
+      assertEquals(ROWKEYS.length, countRows());
+
+      long totalRegions = cluster.countServedRegions();
+
+      // stop a region server and run fsck again
+      cluster.stopRegionServer(server);
+      cluster.waitForRegionServerToStop(server, 60);
+
+      // wait for all regions to come back online.
+      while (cluster.countServedRegions() < totalRegions) {
+        Thread.sleep(100);
+      }
+
+      HBaseFsck hbck3 = doFsck(conf, false);
+      assertNoErrors(hbck3);
     } finally {
       cleanupTable(table);
     }

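To exercise the new cases locally, the usual Surefire class filter should work (a hedged example; exact invocation may vary by checkout):

    cd hbase-server
    mvn test -Dtest=TestHBaseFsck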