hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jxi...@apache.org
Subject svn commit: r1462215 - in /hbase/branches/0.95/hbase-server/src: main/java/org/apache/hadoop/hbase/master/ test/java/org/apache/hadoop/hbase/master/
Date Thu, 28 Mar 2013 17:02:32 GMT
Author: jxiang
Date: Thu Mar 28 17:02:31 2013
New Revision: 1462215

URL: http://svn.apache.org/r1462215
Log:
HBASE-8144 Limit number of attempts to assign a region

Modified:
    hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
    hbase/branches/0.95/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java

Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1462215&r1=1462214&r2=1462215&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
(original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Thu Mar 28 17:02:31 2013
@@ -187,6 +187,16 @@ public class AssignmentManager extends Z
   private final boolean tomActivated;
 
   /**
+   * A map to track the count a region fails to open in a row.
+   * So that we don't try to open a region forever if the failure is
+   * unrecoverable.  We don't put this information in region states
+   * because we don't expect this to happen frequently; we don't
+   * want to copy this information over during each state transition either.
+   */
+  private final ConcurrentHashMap<String, AtomicInteger>
+    failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();
+
+  /**
    * Constructs a new assignment manager.
    *
    * @param server
@@ -541,7 +551,7 @@ public class AssignmentManager extends Z
                 public void process() throws IOException {
                   ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
                   try {
-                    unassign(regionInfo, rs, expectedVersion, sn, true);
+                    unassign(regionInfo, rs, expectedVersion, sn, true, null);
                   } finally {
                     lock.unlock();
                   }
@@ -880,9 +890,25 @@ public class AssignmentManager extends Z
           // When there are more than one region server a new RS is selected as the
           // destination and the same is updated in the regionplan. (HBASE-5546)
           if (regionState != null) {
-            getRegionPlan(regionState.getRegion(), sn, true);
-            this.executorService.submit(new ClosedRegionHandler(server,
-              this, regionState.getRegion()));
+            AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
+            if (failedOpenCount == null) {
+              failedOpenCount = new AtomicInteger();
+              // No need to use putIfAbsent, or extra synchronization since
+              // this whole handleRegion block is locked on the encoded region
+              // name, and failedOpenTracker is updated only in this block
+              failedOpenTracker.put(encodedName, failedOpenCount);
+            }
+            if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
+              regionStates.updateRegionState(
+                regionState.getRegion(), RegionState.State.FAILED_OPEN);
+              // remove the tracking info to save memory, also reset
+              // the count for next open initiative
+              failedOpenTracker.remove(encodedName);
+            } else {
+              getRegionPlan(regionState.getRegion(), sn, true);
+              this.executorService.submit(new ClosedRegionHandler(server,
+                this, regionState.getRegion()));
+            }
           }
           break;
 
@@ -909,11 +935,16 @@ public class AssignmentManager extends Z
               + " from server " + sn + " but region was in the state " + regionState
               + " and not in expected PENDING_OPEN or OPENING states,"
               + " or not on the expected server");
+            // Close it without updating the internal region states,
+            // so as not to create double assignments in unlucky scenarios
+            // mentioned in OpenRegionHandler#process
+            unassign(regionState.getRegion(), null, -1, null, false, sn);
             return;
           }
           // Handle OPENED by removing from transition and deleted zk node
           regionState = regionStates.updateRegionState(rt, RegionState.State.OPEN);
           if (regionState != null) {
+            failedOpenTracker.remove(encodedName); // reset the count, if any
             this.executorService.submit(new OpenedRegionHandler(
               server, this, regionState.getRegion(), sn, expectedVersion));
           }
@@ -1571,12 +1602,23 @@ public class AssignmentManager extends Z
   }
 
   /**
-   * Send CLOSE RPC if the server is online, otherwise, offline the region
+   * Send CLOSE RPC if the server is online, otherwise, offline the region.
+   *
+   * The RPC will be sent only to the region sever found in the region state
+   * if it is passed in, otherwise, to the src server specified. If region
+   * state is not specified, we don't update region state at all, instead
+   * we just send the RPC call. This is useful for some cleanup without
+   * messing around the region states (see handleRegion, on region opened
+   * on an unexpected server scenario, for an example)
    */
   private void unassign(final HRegionInfo region,
       final RegionState state, final int versionOfClosingNode,
-      final ServerName dest, final boolean transitionInZK) {
-    ServerName server = state.getServerName();
+      final ServerName dest, final boolean transitionInZK,
+      final ServerName src) {
+    ServerName server = src;
+    if (state != null) {
+      server = state.getServerName();
+    }
     for (int i = 1; i <= this.maximumAttempts; i++) {
       // ClosedRegionhandler can remove the server from this.regions
       if (!serverManager.isServerOnline(server)) {
@@ -1584,7 +1626,9 @@ public class AssignmentManager extends Z
           // delete the node. if no node exists need not bother.
           deleteClosingOrClosedNode(region);
         }
-        regionOffline(region);
+        if (state != null) {
+          regionOffline(region);
+        }
         return;
       }
       try {
@@ -1608,9 +1652,12 @@ public class AssignmentManager extends Z
           if (transitionInZK) {
             deleteClosingOrClosedNode(region);
           }
-          regionOffline(region);
+          if (state != null) {
+            regionOffline(region);
+          }
           return;
-        } else if (t instanceof RegionAlreadyInTransitionException) {
+        } else if (state != null
+            && t instanceof RegionAlreadyInTransitionException) {
           // RS is already processing this region, only need to update the timestamp
           LOG.debug("update " + state + " the timestamp.");
           state.updateTimestampToNow();
@@ -1622,7 +1669,7 @@ public class AssignmentManager extends Z
       }
     }
     // Run out of attempts
-    if (!tomActivated) {
+    if (!tomActivated && state != null) {
       regionStates.updateRegionState(region, RegionState.State.FAILED_CLOSE);
     }
   }
@@ -1649,7 +1696,7 @@ public class AssignmentManager extends Z
       case CLOSING:
       case PENDING_CLOSE:
       case FAILED_CLOSE:
-        unassign(region, state, -1, null, false);
+        unassign(region, state, -1, null, false, null);
         state = regionStates.getRegionState(region);
         if (state.isOffline()) break;
       case FAILED_OPEN:
@@ -1717,14 +1764,17 @@ public class AssignmentManager extends Z
         }
       }
       if (setOfflineInZK && versionOfOfflineNode == -1) {
-        LOG.warn("Unable to set offline in ZooKeeper to assign " + region);
-        if (!tomActivated) {
-          regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN);
+        LOG.info("Unable to set offline in ZooKeeper to assign " + region);
+        // Setting offline in ZK must have been failed due to ZK racing or some
+        // exception which may make the server to abort. If it is ZK racing,
+        // we should retry since we already reset the region state,
+        // existing (re)assignment will fail anyway.
+        if (!server.isAborted()) {
+          continue;
         }
-        return;
       }
-      if (this.server.isStopped()) {
-        LOG.debug("Server stopped; skipping assign of " + region);
+      if (this.server.isStopped() || this.server.isAborted()) {
+        LOG.debug("Server stopped/aborted; skipping assign of " + region);
         return;
       }
       LOG.info("Assigning region " + region.getRegionNameAsString() +
@@ -2144,7 +2194,7 @@ public class AssignmentManager extends Z
         return;
       }
 
-      unassign(region, state, versionOfClosingNode, dest, true);
+      unassign(region, state, versionOfClosingNode, dest, true, null);
     } finally {
       lock.unlock();
     }

Modified: hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java?rev=1462215&r1=1462214&r2=1462215&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
(original)
+++ hbase/branches/0.95/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java
Thu Mar 28 17:02:31 2013
@@ -253,6 +253,10 @@ public class RegionStates {
       newServerName = null;
     }
 
+    if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
+      LOG.warn("Failed to transition " + hri + " on " + serverName + ": " + state);
+    }
+
     String regionName = hri.getEncodedName();
     RegionState regionState = new RegionState(
       hri, state, System.currentTimeMillis(), newServerName);

Modified: hbase/branches/0.95/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.95/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java?rev=1462215&r1=1462214&r2=1462215&view=diff
==============================================================================
--- hbase/branches/0.95/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
(original)
+++ hbase/branches/0.95/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java
Thu Mar 28 17:02:31 2013
@@ -26,6 +26,8 @@ import java.io.IOException;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.HColumnDescriptor;
 import org.apache.hadoop.hbase.HConstants;
@@ -44,6 +46,7 @@ import org.apache.hadoop.hbase.coprocess
 import org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
 import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.FSUtils;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -61,11 +64,13 @@ public class TestAssignmentManagerOnClus
 
   @BeforeClass
   public static void setUpBeforeClass() throws Exception {
-    // Using the test load balancer to control region plans
+    // Using the mock load balancer to control region plans
     conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS,
-      TestLoadBalancer.class, LoadBalancer.class);
+      MockLoadBalancer.class, LoadBalancer.class);
     conf.setClass(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
-      TestRegionObserver.class, RegionObserver.class);
+      MockRegionObserver.class, RegionObserver.class);
+    // Reduce the maximum attempts to speed up the test
+    conf.setInt("hbase.assignment.maximum.attempts", 3);
 
     TEST_UTIL.startMiniCluster(3);
     admin = TEST_UTIL.getHBaseAdmin();
@@ -141,7 +146,7 @@ public class TestAssignmentManagerOnClus
   /**
    * This tests moving a region
    */
-  @Test
+  @Test (timeout=50000)
   public void testMoveRegion() throws Exception {
     String table = "testMoveRegion";
     try {
@@ -163,7 +168,7 @@ public class TestAssignmentManagerOnClus
       TEST_UTIL.getHBaseAdmin().move(hri.getEncodedNameAsBytes(),
         Bytes.toBytes(destServerName.getServerName()));
 
-      long timeoutTime = System.currentTimeMillis() + 800;
+      long timeoutTime = System.currentTimeMillis() + 30000;
       while (true) {
         ServerName sn = regionStates.getRegionServerOfRegion(hri);
         if (sn != null && sn.equals(destServerName)) {
@@ -172,7 +177,8 @@ public class TestAssignmentManagerOnClus
         }
         long now = System.currentTimeMillis();
         if (now > timeoutTime) {
-          fail("Failed to move the region in time");
+          fail("Failed to move the region in time: "
+            + regionStates.getRegionState(hri));
         }
         regionStates.waitForUpdate(50);
       }
@@ -206,9 +212,59 @@ public class TestAssignmentManagerOnClus
   }
 
   /**
+   * This tests forcefully assign a region
+   * while it's closing and re-assigned.
+   *
+   * This test should not be flaky. If it is flaky, it means something
+   * wrong with AssignmentManager which should be reported and fixed
+   */
+  @Test (timeout=60000)
+  public void testForceAssignWhileClosing() throws Exception {
+    String table = "testForceAssignWhileClosing";
+    try {
+      HTableDescriptor desc = new HTableDescriptor(table);
+      desc.addFamily(new HColumnDescriptor(FAMILY));
+      admin.createTable(desc);
+
+      HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
+      HRegionInfo hri = new HRegionInfo(
+        desc.getName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
+      MetaEditor.addRegionToMeta(meta, hri);
+
+      HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+      master.assignRegion(hri);
+      AssignmentManager am = master.getAssignmentManager();
+      assertTrue(am.waitForAssignment(hri));
+
+      MockRegionObserver.enabled = true;
+      am.unassign(hri);
+      RegionState state = am.getRegionStates().getRegionState(hri);
+      assertEquals(RegionState.State.FAILED_CLOSE, state.getState());
+
+      MockRegionObserver.enabled = false;
+      am.unassign(hri, true);
+
+      // region is closing now, will be re-assigned automatically.
+      // now, let's forcefully assign it again. it should be
+      // assigned properly and no double-assignment
+      am.assign(hri, true, true);
+
+      // region should be closed and re-assigned
+      assertTrue(am.waitForAssignment(hri));
+
+      ServerName serverName = master.getAssignmentManager().
+        getRegionStates().getRegionServerOfRegion(hri);
+      TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
+    } finally {
+      MockRegionObserver.enabled = false;
+      TEST_UTIL.deleteTable(Bytes.toBytes(table));
+    }
+  }
+
+  /**
    * This tests region close failed
    */
-  @Test
+  @Test (timeout=30000)
   public void testCloseFailed() throws Exception {
     String table = "testCloseFailed";
     try {
@@ -226,24 +282,25 @@ public class TestAssignmentManagerOnClus
       AssignmentManager am = master.getAssignmentManager();
       assertTrue(am.waitForAssignment(hri));
 
-      TestRegionObserver.enabled = true;
+      MockRegionObserver.enabled = true;
       am.unassign(hri);
       RegionState state = am.getRegionStates().getRegionState(hri);
       assertEquals(RegionState.State.FAILED_CLOSE, state.getState());
 
-      TestRegionObserver.enabled = false;
+      MockRegionObserver.enabled = false;
       am.unassign(hri, true);
-      state = am.getRegionStates().getRegionState(hri);
-      assertTrue(RegionState.State.FAILED_CLOSE != state.getState());
 
-      am.assign(hri, true, true);
-      assertTrue(am.waitForAssignment(hri));
+      // region may still be assigned now since it's closing,
+      // let's check if it's assigned after it's out of transition
+      am.waitOnRegionToClearRegionsInTransition(hri);
 
+      // region should be closed and re-assigned
+      assertTrue(am.waitForAssignment(hri));
       ServerName serverName = master.getAssignmentManager().
         getRegionStates().getRegionServerOfRegion(hri);
       TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
     } finally {
-      TestRegionObserver.enabled = false;
+      MockRegionObserver.enabled = false;
       TEST_UTIL.deleteTable(Bytes.toBytes(table));
     }
   }
@@ -251,7 +308,7 @@ public class TestAssignmentManagerOnClus
   /**
    * This tests region open failed
    */
-  @Test
+  @Test (timeout=30000)
   public void testOpenFailed() throws Exception {
     String table = "testOpenFailed";
     try {
@@ -264,7 +321,51 @@ public class TestAssignmentManagerOnClus
         desc.getName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
       MetaEditor.addRegionToMeta(meta, hri);
 
-      TestLoadBalancer.controledRegion = hri.getEncodedName();
+      MockLoadBalancer.controledRegion = hri.getEncodedName();
+
+      HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+      master.assignRegion(hri);
+      AssignmentManager am = master.getAssignmentManager();
+      assertFalse(am.waitForAssignment(hri));
+
+      RegionState state = am.getRegionStates().getRegionState(hri);
+      assertEquals(RegionState.State.FAILED_OPEN, state.getState());
+
+      MockLoadBalancer.controledRegion = null;
+      master.assignRegion(hri);
+      assertTrue(am.waitForAssignment(hri));
+
+      ServerName serverName = master.getAssignmentManager().
+        getRegionStates().getRegionServerOfRegion(hri);
+      TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
+    } finally {
+      MockLoadBalancer.controledRegion = null;
+      TEST_UTIL.deleteTable(Bytes.toBytes(table));
+    }
+  }
+
+  /**
+   * This tests region open failure which is not recoverable
+   */
+  @Test (timeout=30000)
+  public void testOpenFailedUnrecoverable() throws Exception {
+    String table = "testOpenFailedUnrecoverable";
+    try {
+      HTableDescriptor desc = new HTableDescriptor(table);
+      desc.addFamily(new HColumnDescriptor(FAMILY));
+      admin.createTable(desc);
+
+      HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
+      HRegionInfo hri = new HRegionInfo(
+        desc.getName(), Bytes.toBytes("A"), Bytes.toBytes("Z"));
+      MetaEditor.addRegionToMeta(meta, hri);
+
+      FileSystem fs = FileSystem.get(conf);
+      Path tableDir= FSUtils.getTablePath(FSUtils.getRootDir(conf), table);
+      Path regionDir = new Path(tableDir, hri.getEncodedName());
+      // create a file named the same as the region dir to
+      // mess up with region opening
+      fs.create(regionDir, true);
 
       HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
       master.assignRegion(hri);
@@ -274,7 +375,8 @@ public class TestAssignmentManagerOnClus
       RegionState state = am.getRegionStates().getRegionState(hri);
       assertEquals(RegionState.State.FAILED_OPEN, state.getState());
 
-      TestLoadBalancer.controledRegion = null;
+      // remove the blocking file, so that region can be opened
+      fs.delete(regionDir, true);
       master.assignRegion(hri);
       assertTrue(am.waitForAssignment(hri));
 
@@ -282,12 +384,11 @@ public class TestAssignmentManagerOnClus
         getRegionStates().getRegionServerOfRegion(hri);
       TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
     } finally {
-      TestLoadBalancer.controledRegion = null;
       TEST_UTIL.deleteTable(Bytes.toBytes(table));
     }
   }
 
-  static class TestLoadBalancer extends StochasticLoadBalancer {
+  static class MockLoadBalancer extends StochasticLoadBalancer {
     // For this region, if specified, always assign to nowhere
     static volatile String controledRegion = null;
 
@@ -301,7 +402,7 @@ public class TestAssignmentManagerOnClus
     }
   }
 
-  public static class TestRegionObserver extends BaseRegionObserver {
+  public static class MockRegionObserver extends BaseRegionObserver {
     // If enabled, fail all preClose calls
     static volatile boolean enabled = false;
 



Mime
View raw message