hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From te...@apache.org
Subject svn commit: r1244137 - in /hbase/branches/0.90: ./ src/main/java/org/apache/hadoop/hbase/executor/ src/main/java/org/apache/hadoop/hbase/master/ src/main/java/org/apache/hadoop/hbase/regionserver/handler/ src/test/java/org/apache/hadoop/hbase/regionserver/handler/
Date Tue, 14 Feb 2012 17:54:23 GMT
Author: tedyu
Date: Tue Feb 14 17:54:23 2012
New Revision: 1244137

URL: http://svn.apache.org/viewvc?rev=1244137&view=rev
Log:
HBASE-5379  Backport HBASE-4287 to 0.90 - If region opening fails, try to transition region back to
               "offline" in ZK (Ram)

Modified:
    hbase/branches/0.90/CHANGES.txt
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java
    hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java

Modified: hbase/branches/0.90/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/CHANGES.txt?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
--- hbase/branches/0.90/CHANGES.txt (original)
+++ hbase/branches/0.90/CHANGES.txt Tue Feb 14 17:54:23 2012
@@ -45,6 +45,8 @@ Release 0.90.6 - Unreleased
    HBASE-5321  this.allRegionServersOffline not set to false after one RS comes online and
                assignment is done in 0.90. (Ram)
    HBASE-5377  Fix licenses on the 0.90 branch.
+   HBASE-5379  Backport HBASE-4287 to 0.90 - If region opening fails, try to transition region back to
+               "offline" in ZK
 
   IMPROVEMENT
    HBASE-5102  Change the default value of the property "hbase.connection.per.config" to false in

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java Tue Feb 14 17:54:23 2012
@@ -107,6 +107,7 @@ public abstract class EventHandler imple
     RS_ZK_REGION_CLOSED       (2),   // RS has finished closing a region
     RS_ZK_REGION_OPENING      (3),   // RS is in process of opening a region
     RS_ZK_REGION_OPENED       (4),   // RS has finished opening a region
+    RS_ZK_REGION_FAILED_OPEN  (5),   // RS failed to open a region
 
     // Messages originating from Master to RS
     M_RS_OPEN_REGION          (20),  // Master asking RS to open a region

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java Tue Feb 14 17:54:23 2012
@@ -107,6 +107,7 @@ public class ExecutorService {
       // Master executor services
 
       case RS_ZK_REGION_CLOSED:
+      case RS_ZK_REGION_FAILED_OPEN:
         return ExecutorType.MASTER_CLOSE_REGION;
 
       case RS_ZK_REGION_OPENED:

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Tue Feb 14 17:54:23 2012
@@ -24,6 +24,7 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.lang.Thread.UncaughtExceptionHandler;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -138,6 +139,10 @@ public class AssignmentManager extends Z
     new TreeMap<HRegionInfo,HServerInfo>();
 
   private final ExecutorService executorService;
+  
+  private List<EventType> ignoreStatesRSOffline = Arrays
+      .asList(new EventType[] { EventType.RS_ZK_REGION_FAILED_OPEN,
+          EventType.RS_ZK_REGION_CLOSED });
 
   /**
    * Constructs a new assignment manager.
@@ -346,6 +351,7 @@ public class AssignmentManager extends Z
         break;
 
       case RS_ZK_REGION_CLOSED:
+      case RS_ZK_REGION_FAILED_OPEN:
         // Region is closed, insert into RIT and handle it
         addToRITandCallClose(regionInfo, RegionState.State.CLOSED, data);
         break;
@@ -473,7 +479,8 @@ public class AssignmentManager extends Z
       }
       // Verify this is a known server
       if (!serverManager.isServerOnline(data.getServerName()) &&
-          !this.master.getServerName().equals(data.getServerName())) {
+          !this.master.getServerName().equals(data.getServerName())
+          && !ignoreStatesRSOffline.contains(data.getEventType())) {
         LOG.warn("Attempted to handle region transition for server but " +
           "server is not online: " + Bytes.toString(data.getRegionName()));
         return;
@@ -541,6 +548,20 @@ public class AssignmentManager extends Z
           // Transition to OPENING (or update stamp if already OPENING)
           regionState.update(RegionState.State.OPENING, data.getStamp());
           break;
+        case RS_ZK_REGION_FAILED_OPEN:
+        if (regionState == null
+            || (!regionState.isPendingOpen() && !regionState.isOpening())) {
+          LOG.warn("Received FAILED_OPEN for region " + prettyPrintedRegionName
+              + " from server " + data.getServerName() + " but region was in "
+              + " the state " + regionState
+              + " and not in PENDING_OPEN or OPENING");
+          return;
+        }
+        // Handle this the same as if it were opened and then closed.
+        regionState.update(RegionState.State.CLOSED, data.getStamp());
+        this.executorService.submit(new ClosedRegionHandler(master, this,
+            regionState.getRegion()));
+        break;
 
         case RS_ZK_REGION_OPENED:
           // Should see OPENED after OPENING but possible after PENDING_OPEN
@@ -1996,8 +2017,8 @@ public class AssignmentManager extends Z
               LOG.debug("Region has transitioned to OPENED, allowing " +
                   "watched event handlers to process");
               break;
-            } else if (data.getEventType() !=
-                EventType.RS_ZK_REGION_OPENING) {
+            } else if (data.getEventType() != EventType.RS_ZK_REGION_OPENING
+              && data.getEventType() != EventType.RS_ZK_REGION_FAILED_OPEN) {
               LOG.warn("While timing out a region in state OPENING, " +
                   "found ZK node in unexpected state: " +
                   data.getEventType());

Modified: hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
--- hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java (original)
+++ hbase/branches/0.90/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java Tue Feb 14 17:54:23 2012
@@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFac
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.executor.EventHandler;
+import org.apache.hadoop.hbase.executor.EventHandler.EventType;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
 import org.apache.hadoop.hbase.util.CancelableProgressable;
@@ -95,13 +96,17 @@ public class OpenRegionHandler extends E
       if (!transitionZookeeperOfflineToOpening(encodedName)) {
         LOG.warn("Region was hijacked? It no longer exists, encodedName=" +
           encodedName);
+        tryTransitionToFailedOpen(regionInfo);
         return;
       }
 
       // Open region.  After a successful open, failures in subsequent
       // processing needs to do a close as part of cleanup.
       region = openRegion();
-      if (region == null) return;
+      if (region == null) {
+        tryTransitionToFailedOpen(regionInfo);
+        return;
+      }
       boolean failed = true;
       if (tickleOpening("post_region_open")) {
         if (updateMeta(region)) failed = false;
@@ -110,10 +115,17 @@ public class OpenRegionHandler extends E
       if (failed || this.server.isStopped() ||
           this.rsServices.isStopping()) {
         cleanupFailedOpen(region);
+        tryTransitionToFailedOpen(regionInfo);
         return;
       }
 
       if (!transitionToOpened(region)) {
+        // If we fail to transition to opened, it's because of one of two cases:
+        //    (a) we lost our ZK lease
+        // OR (b) someone else opened the region before us
+        // In either case, we don't need to transition to FAILED_OPEN state.
+        // In case (a), the Master will process us as a dead server. In case
+        // (b) the region is already being handled elsewhere anyway.
         cleanupFailedOpen(region);
         return;
       }
@@ -125,6 +137,35 @@ public class OpenRegionHandler extends E
           remove(this.regionInfo.getEncodedNameAsBytes());
     }
   }
+  /**
+   * @param hri Region we're working on.
+   * This is not guaranteed to succeed, we just do our best.
+   * @return whether znode is successfully transitioned to FAILED_OPEN state.
+   */
+  private boolean tryTransitionToFailedOpen(final HRegionInfo hri) {
+    boolean result = false;
+    final String name = hri.getRegionNameAsString();
+    try {
+      LOG.info("Opening of region " + hri
+          + " failed, marking as FAILED_OPEN in ZK");
+      if (ZKAssign.transitionNode(
+          this.server.getZooKeeper(), hri,
+          this.server.getServerName(),
+          EventType.RS_ZK_REGION_OPENING,
+          EventType.RS_ZK_REGION_FAILED_OPEN,
+          this.version) == -1) {
+        LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
+            "It's likely that the master already timed out this open " +
+            "attempt, and thus another RS already has the region.");
+      } else {
+        result = true;
+      }
+    } catch (KeeperException e) {
+      LOG.error("Failed transitioning node " + name +
+        " from OPENING to FAILED_OPEN", e);
+    }
+    return result;
+  }
 
   /**
    * Update ZK, ROOT or META.  This can take a while if for example the
@@ -133,7 +174,7 @@ public class OpenRegionHandler extends E
    * state meantime so master doesn't timeout our region-in-transition.
    * Caller must cleanup region if this fails.
    */
-  private boolean updateMeta(final HRegion r) {
+  boolean updateMeta(final HRegion r) {
     if (this.server.isStopped() || this.rsServices.isStopping()) {
       return false;
     }
@@ -285,11 +326,12 @@ public class OpenRegionHandler extends E
             return tickleOpening("open_region_progress");
           }
         });
-    } catch (IOException e) {
-      // We failed open.  Let our znode expire in regions-in-transition and
-      // Master will assign elsewhere.  Presumes nothing to close.
+    } catch (Throwable t) {
+      // We failed open. Our caller will see the 'null' return value
+      // and transition the node back to FAILED_OPEN. If that fails,
+      // we rely on the Timeout Monitor in the master to reassign.
       LOG.error("Failed open of region=" +
-        this.regionInfo.getRegionNameAsString(), e);
+        this.regionInfo.getRegionNameAsString(), t);
     }
     return region;
   }

Modified: hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java?rev=1244137&r1=1244136&r2=1244137&view=diff
==============================================================================
--- hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java (original)
+++ hbase/branches/0.90/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java Tue Feb 14 17:54:23 2012
@@ -19,6 +19,8 @@
  */
 package org.apache.hadoop.hbase.regionserver.handler;
 
+import static org.junit.Assert.assertEquals;
+
 import java.io.IOException;
 
 import org.apache.hadoop.hbase.HBaseTestingUtility;
@@ -26,8 +28,11 @@ import org.apache.hadoop.hbase.HConstant
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HTableDescriptor;
 import org.apache.hadoop.hbase.Server;
+import org.apache.hadoop.hbase.executor.RegionTransitionData;
+import org.apache.hadoop.hbase.executor.EventHandler.EventType;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
+import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.MockRegionServerServices;
 import org.apache.hadoop.hbase.util.MockServer;
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
@@ -36,6 +41,7 @@ import org.apache.hadoop.hbase.zookeeper
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NodeExistsException;
 import org.junit.AfterClass;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -44,14 +50,31 @@ import org.junit.Test;
  */
 public class TestOpenRegionHandler {
   private final static HBaseTestingUtility HTU = new HBaseTestingUtility();
+  private static HTableDescriptor TEST_HTD;
+  private HRegionInfo TEST_HRI;
+  
+  private int testIndex = 0;
 
   @BeforeClass public static void before() throws Exception {
     HTU.startMiniZKCluster();
+    TEST_HTD = new HTableDescriptor("TestOpenRegionHandler.java");
   }
 
   @AfterClass public static void after() throws IOException {
+    TEST_HTD = null;
     HTU.shutdownMiniZKCluster();
   }
+  /**
+   * Before each test, use a different HRI, so the different tests
+   * don't interfere with each other. This allows us to use just
+   * a single ZK cluster for the whole suite.
+   */
+  @Before
+  public void setupHRI() {
+    TEST_HRI = new HRegionInfo(TEST_HTD, Bytes.toBytes(testIndex), Bytes
+        .toBytes(testIndex + 1));
+    testIndex++;
+  }
 
   /**
    * Test the openregionhandler can deal with its znode being yanked out from
@@ -94,4 +117,55 @@ public class TestOpenRegionHandler {
     // post OPENING; again will expect it to come back w/o NPE or exception.
     handler.process();
   }
+  @Test
+  public void testFailedOpenRegion() throws Exception {
+    Server server = new MockServer();
+    RegionServerServices rsServices = new MockRegionServerServices();
+
+    // Create it OFFLINE, which is what it expects
+    ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server
+        .getServerName());
+
+    // Create the handler
+    OpenRegionHandler handler =
+      new OpenRegionHandler(server, rsServices, TEST_HRI) {
+        @Override
+        HRegion openRegion() {
+          // Fake failure of opening a region due to an IOE, which is caught
+          return null;
+        }
+    };
+    handler.process();
+
+    // Handler should have transitioned it to FAILED_OPEN
+    RegionTransitionData data =
+      ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
+    assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
+  }
+  
+  @Test
+  public void testFailedUpdateMeta() throws Exception {
+    Server server = new MockServer();
+    RegionServerServices rsServices = new MockRegionServerServices();
+
+    // Create it OFFLINE, which is what it expects
+    ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server
+        .getServerName());
+
+    // Create the handler
+    OpenRegionHandler handler =
+      new OpenRegionHandler(server, rsServices, TEST_HRI) {
+        @Override
+        boolean updateMeta(final HRegion r) {
+          // Fake failure of updating META
+          return false;
+        }
+    };
+    handler.process();
+
+    // Handler should have transitioned it to FAILED_OPEN
+    RegionTransitionData data =
+      ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName());
+    assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType());
+  }
 }



Mime
View raw message