Return-Path: X-Original-To: apmail-hbase-commits-archive@www.apache.org Delivered-To: apmail-hbase-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 67D1B101BA for ; Fri, 18 Oct 2013 15:57:56 +0000 (UTC) Received: (qmail 10021 invoked by uid 500); 18 Oct 2013 15:57:53 -0000 Delivered-To: apmail-hbase-commits-archive@hbase.apache.org Received: (qmail 9885 invoked by uid 500); 18 Oct 2013 15:57:53 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 9797 invoked by uid 99); 18 Oct 2013 15:57:52 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 18 Oct 2013 15:57:52 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 18 Oct 2013 15:57:50 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 7F70823889BB; Fri, 18 Oct 2013 15:57:30 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1533526 - in /hbase/branches/0.96/hbase-server/src: main/java/org/apache/hadoop/hbase/master/AssignmentManager.java test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java Date: Fri, 18 Oct 2013 15:57:30 -0000 To: commits@hbase.apache.org From: jxiang@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20131018155730.7F70823889BB@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jxiang Date: Fri Oct 18 15:57:30 2013 New Revision: 1533526 URL: http://svn.apache.org/r1533526 Log: HBASE-9793 Offline a region before it's closed could cause double assignment Modified: hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java Modified: hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1533526&r1=1533525&r2=1533526&view=diff ============================================================================== --- hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original) +++ hbase/branches/0.96/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Fri Oct 18 15:57:30 2013 @@ -114,6 +114,10 @@ public class AssignmentManager extends Z public static final String ASSIGNMENT_TIMEOUT_MANAGEMENT = "hbase.assignment.timeout.management"; public static final boolean DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT = false; + public static final String ALREADY_IN_TRANSITION_WAITTIME + = "hbase.assignment.already.intransition.waittime"; + public static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute + protected final Server server; private ServerManager serverManager; @@ -1666,7 +1670,12 @@ public class AssignmentManager extends Z if (state != null) { server = state.getServerName(); } + long maxWaitTime = -1; for (int i = 1; i <= this.maximumAttempts; i++) { + if (this.server.isStopped() || this.server.isAborted()) { + LOG.debug("Server stopped/aborted; skipping unassign of " + region); + return; + } // ClosedRegionhandler can remove the server from this.regions if (!serverManager.isServerOnline(server)) { if (transitionInZK) { @@ -1685,7 +1694,10 @@ public class AssignmentManager extends Z LOG.debug("Sent CLOSE to " + server + " for region " + region.getRegionNameAsString()); if (!transitionInZK && state != null) { - regionOffline(region); + // Retry to make sure the region is + // closed so as to avoid double assignment. + unassign(region, state, versionOfClosingNode, + dest, transitionInZK,src); } return; } @@ -1711,11 +1723,34 @@ public class AssignmentManager extends Z // RS is already processing this region, only need to update the timestamp LOG.debug("update " + state + " the timestamp."); state.updateTimestampToNow(); + if (maxWaitTime < 0) { + maxWaitTime = EnvironmentEdgeManager.currentTimeMillis() + + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME, + DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); + } + try { + long now = EnvironmentEdgeManager.currentTimeMillis(); + if (now < maxWaitTime) { + LOG.debug("Region is already in transition; " + + "waiting up to " + (maxWaitTime - now) + "ms", t); + Thread.sleep(100); + i--; // reset the try count + } + } catch (InterruptedException ie) { + LOG.warn("Failed to unassign " + + region.getRegionNameAsString() + " since interrupted", ie); + Thread.currentThread().interrupt(); + if (!tomActivated) { + regionStates.updateRegionState(region, State.FAILED_CLOSE); + } + return; + } + } else { + LOG.info("Server " + server + " returned " + t + " for " + + region.getRegionNameAsString() + ", try=" + i + + " of " + this.maximumAttempts, t); + // Presume retry or server will expire. } - LOG.info("Server " + server + " returned " + t + " for " - + region.getRegionNameAsString() + ", try=" + i - + " of " + this.maximumAttempts, t); - // Presume retry or server will expire. } } // Run out of attempts @@ -1819,10 +1854,15 @@ public class AssignmentManager extends Z RegionState currentState = state; int versionOfOfflineNode = -1; RegionPlan plan = null; - long maxRegionServerStartupWaitTime = -1; + long maxWaitTime = -1; HRegionInfo region = state.getRegion(); RegionOpeningState regionOpenState; - for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) { + for (int i = 1; i <= maximumAttempts; i++) { + if (server.isStopped() || server.isAborted()) { + LOG.info("Skip assigning " + region.getRegionNameAsString() + + ", the server is stopped/aborted"); + return; + } if (plan == null) { // Get a server for the region at first try { plan = getRegionPlan(region, forceNewPlan); @@ -1883,10 +1923,6 @@ public class AssignmentManager extends Z continue; } } - if (this.server.isStopped() || this.server.isAborted()) { - LOG.debug("Server stopped/aborted; skipping assign of " + region); - return; - } LOG.info("Assigning " + region.getRegionNameAsString() + " to " + plan.getDestination().toString()); // Transition RegionState to PENDING_OPEN @@ -1940,22 +1976,27 @@ public class AssignmentManager extends Z if (hold) { LOG.warn(assignMsg + ", waiting a little before trying on the same region server " + - "try=" + i + " of " + this.maximumAttempts, t); + "try=" + i + " of " + this.maximumAttempts, t); - if (maxRegionServerStartupWaitTime < 0) { - maxRegionServerStartupWaitTime = EnvironmentEdgeManager.currentTimeMillis() + - this.server.getConfiguration(). - getLong("hbase.regionserver.rpc.startup.waittime", 60000); + if (maxWaitTime < 0) { + if (t instanceof RegionAlreadyInTransitionException) { + maxWaitTime = EnvironmentEdgeManager.currentTimeMillis() + + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME, + DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); + } else { + maxWaitTime = this.server.getConfiguration(). + getLong("hbase.regionserver.rpc.startup.waittime", 60000); + } } try { + needNewPlan = false; long now = EnvironmentEdgeManager.currentTimeMillis(); - if (now < maxRegionServerStartupWaitTime) { - LOG.debug("Server is not yet up; waiting up to " + - (maxRegionServerStartupWaitTime - now) + "ms", t); + if (now < maxWaitTime) { + LOG.debug("Server is not yet up or region is already in transition; " + + "waiting up to " + (maxWaitTime - now) + "ms", t); Thread.sleep(100); i--; // reset the try count - needNewPlan = false; - } else { + } else if (!(t instanceof RegionAlreadyInTransitionException)) { LOG.debug("Server is not up for a while; try a new one", t); needNewPlan = true; } Modified: hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java?rev=1533526&r1=1533525&r2=1533526&view=diff ============================================================================== --- hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java (original) +++ hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java Fri Oct 18 15:57:30 2013 @@ -553,6 +553,8 @@ public class TestAssignmentManagerOnClus // Unassign it again forcefully so that we can trigger already // in transition exception. This test is to make sure this scenario // is handled properly. + am.server.getConfiguration().setLong( + AssignmentManager.ALREADY_IN_TRANSITION_WAITTIME, 1000); am.unassign(hri, true); RegionState state = am.getRegionStates().getRegionState(hri); assertEquals(RegionState.State.FAILED_CLOSE, state.getState());