hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From st...@apache.org
Subject svn commit: r1001911 - in /hbase/trunk: ./ src/main/java/org/apache/hadoop/hbase/executor/ src/main/java/org/apache/hadoop/hbase/master/ src/main/java/org/apache/hadoop/hbase/regionserver/
Date Mon, 27 Sep 2010 21:02:05 GMT
Author: stack
Date: Mon Sep 27 21:02:04 2010
New Revision: 1001911

URL: http://svn.apache.org/viewvc?rev=1001911&view=rev
Log:
HBASE-3039 Stuck in regionsInTransition because rebalance came in at same time as a split

M src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
  Add region name to warning log message (w/o it message is no good).
M src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
  Add src of split message else need to deduce where it came from by looking
  elsewhere.
M src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
  Updated log messages to include region and where appropritate source
  server name; debug hard w/o
  Changed regionOnline and regionOffline to check for unexpected
  states and log warnings rather than proceed regardless.
  Added in fix for concurrent balance+split; split message now
  updates regionsintransition where previous it did not.
  Remove checkRegion method.  Its a reimplementation of
  what regionOnline and regionOffline do only less comprehensive
  regards what gets updated (this.regions + this.servers rather
  than this.regions, this.servers and regionsInTransition)
  That they were less comprehensive is root of this bug.
M src/main/java/org/apache/hadoop/hbase/master/HMaster.java
  Make the message about why we are not running balancer richer
  (print out how many reigons in transition and more of the
  regionsintrnasition list).
M src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
  Javadoc and minor formatting.

Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Mon Sep 27 21:02:04 2010
@@ -542,6 +542,8 @@ Release 0.21.0 - Unreleased
    HBASE-2992  [replication] MalformedObjectNameException in ReplicationMetrics
    HBASE-3037  When new master joins running cluster does "Received report from
                unknown server -- telling it to STOP_REGIONSERVER.
+   HBASE-3039  Stuck in regionsInTransition because rebalance came in at same
+               time as a split
 
   IMPROVEMENTS
    HBASE-1760  Cleanup TODOs in HTable

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/RegionTransitionData.java Mon
Sep 27 21:02:04 2010
@@ -68,7 +68,7 @@ public class RegionTransitionData implem
    * {@link EventType#M2ZK_REGION_OFFLINE}.
    *
    * @param eventType type of event
-   * @param regionName name of region
+   * @param regionName name of region as per {@link HRegionInfo#getRegionName()}
    */
   public RegionTransitionData(EventType eventType, byte [] regionName) {
     this(eventType, regionName, null);
@@ -85,7 +85,7 @@ public class RegionTransitionData implem
    * and {@link EventType#RS2ZK_REGION_OPENED}.
    *
    * @param eventType type of event
-   * @param regionName name of region
+   * @param regionName name of region as per {@link HRegionInfo#getRegionName()}
    * @param serverName name of server setting data
    */
   public RegionTransitionData(EventType eventType, byte [] regionName,
@@ -117,7 +117,7 @@ public class RegionTransitionData implem
    * Gets the name of the region being transitioned.
    *
    * <p>Region name is required so this never returns null.
-   * @return region name
+   * @return region name, the result of a call to {@link HRegionInfo#getRegionName()}
    */
   public byte [] getRegionName() {
     return regionName;

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Mon Sep
27 21:02:04 2010
@@ -60,8 +60,8 @@ import org.apache.hadoop.hbase.util.Thre
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
 import org.apache.hadoop.hbase.zookeeper.ZKTableDisable;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
-import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
 import org.apache.hadoop.io.Writable;
 import org.apache.zookeeper.KeeperException;
 
@@ -234,7 +234,7 @@ public class AssignmentManager extends Z
    * yet).
    * @param data
    */
-  private void handleRegion(RegionTransitionData data) {
+  private void handleRegion(final RegionTransitionData data) {
     synchronized(regionsInTransition) {
       // Verify this is a known server
       if (!serverManager.isServerOnline(data.getServerName()) &&
@@ -248,7 +248,7 @@ public class AssignmentManager extends Z
       LOG.debug("Handling transition=" + data.getEventType() +
         ", server=" + data.getServerName() + ", region=" + prettyPrintedRegionName);
       RegionState regionState = regionsInTransition.get(encodedName);
-      switch(data.getEventType()) {
+      switch (data.getEventType()) {
         case M_ZK_REGION_OFFLINE:
           // Nothing to do.
           break;
@@ -259,9 +259,9 @@ public class AssignmentManager extends Z
           if (regionState == null ||
               (!regionState.isPendingClose() && !regionState.isClosing())) {
             LOG.warn("Received CLOSING for region " + prettyPrintedRegionName +
-                " from server " + data.getServerName() + " but region was in " +
-                " the state " + regionState + " and not " +
-                "in expected PENDING_CLOSE or CLOSING states");
+              " from server " + data.getServerName() + " but region was in " +
+              " the state " + regionState + " and not " +
+              "in expected PENDING_CLOSE or CLOSING states");
             return;
           }
           // Transition to CLOSING (or update stamp if already CLOSING)
@@ -422,12 +422,21 @@ public class AssignmentManager extends Z
    * @param serverInfo
    */
   public void regionOnline(HRegionInfo regionInfo, HServerInfo serverInfo) {
-    synchronized(regionsInTransition) {
-      regionsInTransition.remove(regionInfo.getEncodedName());
-      regionsInTransition.notifyAll();
+    synchronized (this.regionsInTransition) {
+      RegionState rs =
+        this.regionsInTransition.remove(regionInfo.getEncodedName());
+      if (rs != null) {
+        this.regionsInTransition.notifyAll();
+        LOG.warn("Asked online a region that was already in " +
+          "regionsInTransition: " + rs);
+      }
     }
-    synchronized(regions) {
-      regions.put(regionInfo, serverInfo);
+    synchronized (this.regions) {
+      // Add check
+      HServerInfo hsi = this.regions.get(regionInfo);
+      if (hsi != null) LOG.warn("Overwriting " + regionInfo.getEncodedName() +
+        " on " + hsi);
+      this.regions.put(regionInfo, serverInfo);
       addToServers(serverInfo, regionInfo);
     }
   }
@@ -440,15 +449,20 @@ public class AssignmentManager extends Z
    * @param regionInfo
    * @param serverInfo
    */
-  public void regionOffline(HRegionInfo regionInfo) {
-    synchronized(regionsInTransition) {
-      regionsInTransition.remove(regionInfo.getEncodedName());
-      regionsInTransition.notifyAll();
+  public void regionOffline(final HRegionInfo regionInfo) {
+    synchronized(this.regionsInTransition) {
+      if (this.regionsInTransition.remove(regionInfo.getEncodedName()) != null) {
+        this.regionsInTransition.notifyAll();
+      }
     }
-    synchronized(regions) {
-      HServerInfo serverInfo = regions.remove(regionInfo);
-      List<HRegionInfo> serverRegions = servers.get(serverInfo);
-      serverRegions.remove(regionInfo);
+    synchronized(this.regions) {
+      HServerInfo serverInfo = this.regions.remove(regionInfo);
+      if (serverInfo != null) {
+        List<HRegionInfo> serverRegions = this.servers.get(serverInfo);
+        serverRegions.remove(regionInfo);
+      } else {
+        LOG.warn("Asked offline a region that was not online: " + regionInfo);
+      }
     }
   }
 
@@ -580,17 +594,17 @@ public class AssignmentManager extends Z
       region.getRegionNameAsString() + " (offlining)");
     // Check if this region is currently assigned
     if (!regions.containsKey(region)) {
-      LOG.debug("Attempted to unassign region " + region.getRegionNameAsString() +
-        " but it is not " +
+      LOG.debug("Attempted to unassign region " +
+        region.getRegionNameAsString() + " but it is not " +
         "currently assigned anywhere");
       return;
     }
     String encodedName = region.getEncodedName();
     // Grab the state of this region and synchronize on it
     RegionState state;
-    synchronized(regionsInTransition) {
+    synchronized (regionsInTransition) {
       state = regionsInTransition.get(encodedName);
-      if(state == null) {
+      if (state == null) {
         state = new RegionState(region, RegionState.State.PENDING_CLOSE);
         regionsInTransition.put(encodedName, state);
       } else {
@@ -923,27 +937,29 @@ public class AssignmentManager extends Z
             HRegionInfo regionInfo = regionState.getRegion();
             LOG.info("Regions in transition timed out:  " + regionState);
             // Expired!  Do a retry.
-            switch(regionState.getState()) {
+            switch (regionState.getState()) {
               case OFFLINE:
               case CLOSED:
                 LOG.info("Region has been OFFLINE or CLOSED for too long, " +
-                    "reassigning " + regionInfo.getRegionNameAsString());
+                  "reassigning " + regionInfo.getRegionNameAsString());
                 assign(regionState.getRegion());
                 break;
               case PENDING_OPEN:
               case OPENING:
                 LOG.info("Region has been PENDING_OPEN or OPENING for too " +
-                    "long, reassigning " + regionInfo.getRegionNameAsString());
+                  "long, reassigning region=" +
+                   regionInfo.getRegionNameAsString());
                 assign(regionState.getRegion());
                 break;
               case OPEN:
                 LOG.warn("Long-running region in OPEN state?  This should " +
-                    "not happen");
+                  "not happen; region=" + regionInfo.getRegionNameAsString());
                 break;
               case PENDING_CLOSE:
               case CLOSING:
                 LOG.info("Region has been PENDING_CLOSE or CLOSING for too " +
-                    "long, resending close rpc");
+                  "long, running unassign again on region=" +
+                  regionInfo.getRegionNameAsString());
                 unassign(regionInfo);
                 break;
             }
@@ -1017,61 +1033,9 @@ public class AssignmentManager extends Z
    */
   public void handleSplitReport(final HServerInfo hsi, final HRegionInfo parent,
       final HRegionInfo a, final HRegionInfo b) {
-    synchronized (this.regions) {
-      checkRegion(hsi, parent, true);
-      checkRegion(hsi, a, false);
-      checkRegion(hsi, b, false);
-      this.regions.put(a, hsi);
-      this.regions.put(b, hsi);
-      removeFromServers(hsi, parent, true);
-      removeFromServers(hsi, a, false);
-      removeFromServers(hsi, b, false);
-      addToServers(hsi, a);
-      addToServers(hsi, b);
-    }
-  }
-
-  /*
-   * Caller must hold locks on this.regions Map.
-   * @param hsi
-   * @param hri
-   * @param expected True if we expect <code>hri</code> to be in this.regions.
-   */
-  private void checkRegion(final HServerInfo hsi, final HRegionInfo hri,
-      final boolean expected) {
-    HServerInfo serverInfo = regions.remove(hri);
-    if (expected) {
-      if (serverInfo == null) {
-        LOG.info("Region not on a server: " + hri.getRegionNameAsString());
-      }
-    } else {
-      if (serverInfo != null) {
-        LOG.warn("Region present on " + hsi + "; unexpected");
-      }
-    }
-  }
-
-  /*
-   * Caller must hold locks on servers Map.
-   * @param hsi
-   * @param hri
-   * @param expected
-   */
-  private void removeFromServers(final HServerInfo hsi, final HRegionInfo hri,
-      final boolean expected) {
-    List<HRegionInfo> serverRegions = this.servers.get(hsi);
-    boolean removed = serverRegions.remove(hri);
-    if (expected) {
-      if (!removed) {
-        LOG.warn(hri.getRegionNameAsString() + " not found on " + hsi +
-          "; unexpected");
-      }
-    } else {
-      if (removed) {
-        LOG.warn(hri.getRegionNameAsString() + " found on " + hsi +
-        "; unexpected");
-      }
-    }
+    regionOffline(parent);
+    regionOnline(a, hsi);
+    regionOnline(b, hsi);
   }
 
   /**

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Mon Sep 27 21:02:04
2010
@@ -572,9 +572,11 @@ implements HMasterInterface, HMasterRegi
     synchronized (this.balancer) {
       // Only allow one balance run at at time.
       if (this.assignmentManager.isRegionsInTransition()) {
-        LOG.debug("Not running balancer because regions in transition: " +
+        LOG.debug("Not running balancer because " +
+          this.assignmentManager.getRegionsInTransition().size() +
+          " region(s) in transition: " +
           org.apache.commons.lang.StringUtils.
-            abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 64));
+            abbreviate(this.assignmentManager.getRegionsInTransition().toString(), 256));
         return false;
       }
       if (!this.serverManager.getDeadServers().isEmpty()) {

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Mon Sep 27
21:02:04 2010
@@ -285,7 +285,7 @@ public class ServerManager {
     }
 
     for (HMsg msg: msgs) {
-      LOG.info("Received " + msg);
+      LOG.info("Received " + msg + " from " + serverInfo.getServerName());
       switch (msg.getType()) {
       case REGION_SPLIT:
         this.services.getAssignmentManager().handleSplitReport(serverInfo,
@@ -527,8 +527,9 @@ public class ServerManager {
   throws NotServingRegionException {
     HRegionInterface hri = getServerConnection(server);
     if(hri == null) {
-      LOG.warn("Attempting to send CLOSE RPC to server " + server.getServerName()
-          + " failed because no RPC connection found to this server");
+      LOG.warn("Attempting to send CLOSE RPC to server " +
+        server.getServerName() + " failed because no RPC connection found " +
+        "to this server");
       return;
     }
     hri.closeRegion(region);

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1001911&r1=1001910&r2=1001911&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Mon
Sep 27 21:02:04 2010
@@ -1924,7 +1924,8 @@ public class HRegionServer implements HR
     // TODO: Need to check if this is being served here but currently undergoing
     // a split (so master needs to retry close after split is complete)
     if (!onlineRegions.containsKey(region.getEncodedName())) {
-      LOG.warn("Received close for region we are not serving");
+      LOG.warn("Received close for region we are not serving; " +
+        region.getEncodedName());
       throw new NotServingRegionException("Received close for "
           + region.getRegionNameAsString() + " but we are not serving it");
     }



Mime
View raw message