Return-Path: X-Original-To: apmail-hbase-commits-archive@www.apache.org Delivered-To: apmail-hbase-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 37D37107D8 for ; Tue, 1 Oct 2013 22:02:38 +0000 (UTC) Received: (qmail 30581 invoked by uid 500); 1 Oct 2013 22:02:37 -0000 Delivered-To: apmail-hbase-commits-archive@hbase.apache.org Received: (qmail 30491 invoked by uid 500); 1 Oct 2013 22:02:37 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 30483 invoked by uid 99); 1 Oct 2013 22:02:37 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 01 Oct 2013 22:02:37 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED,T_FILL_THIS_FORM_SHORT X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 01 Oct 2013 22:02:26 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 820822388860; Tue, 1 Oct 2013 22:02:03 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1528227 [1/2] - in /hbase/trunk: hbase-client/src/main/java/org/apache/hadoop/hbase/ hbase-it/src/test/java/org/apache/hadoop/hbase/ hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/ hbase-it/src/test/java/org/apache/hadoop/hba... 
Date: Tue, 01 Oct 2013 22:02:02 -0000 To: commits@hbase.apache.org From: jxiang@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20131001220203.820822388860@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jxiang Date: Tue Oct 1 22:02:01 2013 New Revision: 1528227 URL: http://svn.apache.org/r1528227 Log: HBASE-9514 Prevent region from assigning before log splitting is done Added: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpClusterStatusAction.java (with props) Modified: hbase/trunk/hbase-client/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestBase.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddColumnAction.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CompactRandomRegionOfTableAction.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FlushRandomRegionOfTableAction.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MergeRandomAdjacentRegionsOfTableAction.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRandomRegionOfTableAction.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/GeneralBulkAssigner.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java 
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ClosedRegionHandler.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/EnableTableHandler.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/MetaServerShutdownHandler.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/TestDrainingServer.java hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java Modified: hbase/trunk/hbase-client/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-client/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-client/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java (original) +++ hbase/trunk/hbase-client/src/main/java/org/apache/hadoop/hbase/ClusterStatus.java Tue Oct 1 22:02:01 2013 @@ -272,11 +272,39 @@ public class ClusterStatus extends Versi return balancerOn != null && balancerOn; } - public Boolean getBalancerOn(){ + public Boolean getBalancerOn() { return balancerOn; } - /** + public String toString() { + StringBuilder sb = new StringBuilder(1024); + sb.append("Master: " + master); + sb.append("\nNumber of backup masters: " + backupMasters.size()); + for (ServerName serverName: backupMasters) { + sb.append("\n " + serverName); + } + + 
sb.append("\nNumber of live region servers: " + liveServers.size()); + for (ServerName serverName: liveServers.keySet()) { + sb.append("\n " + serverName.getServerName()); + } + + sb.append("\nNumber of dead region servers: " + deadServers.size()); + for (ServerName serverName: deadServers) { + sb.append("\n " + serverName); + } + + sb.append("\nAverage load: " + getAverageLoad()); + sb.append("\nNumber of requests: " + getRequestsCount()); + sb.append("\nNumber of regions: " + getRegionsCount()); + sb.append("\nNumber of regions in transition: " + intransition.size()); + for (RegionState state: intransition.values()) { + sb.append("\n " + state.toDescriptiveString()); + } + return sb.toString(); + } + + /** * Convert a ClusterStatus to a protobuf ClusterStatus * * @return the protobuf ClusterStatus Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestBase.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestBase.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestBase.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestBase.java Tue Oct 1 22:02:01 2013 @@ -95,8 +95,14 @@ public abstract class IntegrationTestBas @After public void cleanUpMonkey() throws Exception { - monkey.stop("Ending test"); - monkey.waitForStop(); + cleanUpMonkey("Ending test"); + } + + protected void cleanUpMonkey(String why) throws Exception { + if (monkey != null && !monkey.isStopped()) { + monkey.stop(why); + monkey.waitForStop(); + } } protected IntegrationTestingUtility getTestingUtil(Configuration conf) { Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddColumnAction.java URL: 
http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddColumnAction.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddColumnAction.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/AddColumnAction.java Tue Oct 1 22:02:01 2013 @@ -19,7 +19,6 @@ package org.apache.hadoop.hbase.chaos.actions; import java.io.IOException; -import java.util.Random; import org.apache.commons.lang.RandomStringUtils; import org.apache.hadoop.hbase.HColumnDescriptor; Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CompactRandomRegionOfTableAction.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CompactRandomRegionOfTableAction.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CompactRandomRegionOfTableAction.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/CompactRandomRegionOfTableAction.java Tue Oct 1 22:02:01 2013 @@ -53,13 +53,18 @@ public class CompactRandomRegionOfTableA public void perform() throws Exception { HBaseTestingUtility util = context.getHaseIntegrationTestingUtility(); HBaseAdmin admin = util.getHBaseAdmin(); - List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes); boolean major = RandomUtils.nextInt(100) < majorRatio; LOG.info("Performing action: Compact random region of table " + tableName + ", major=" + major); + List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes); + if (regions == null || regions.isEmpty()) { + LOG.info("Table " + tableName + " doesn't have regions to compact"); + return; + } + HRegionInfo region = 
PolicyBasedChaosMonkey.selectRandomItem( - regions.toArray(new HRegionInfo[regions.size()])); + regions.toArray(new HRegionInfo[regions.size()])); if (major) { LOG.debug("Major compacting region " + region.getRegionNameAsString()); Added: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpClusterStatusAction.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpClusterStatusAction.java?rev=1528227&view=auto ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpClusterStatusAction.java (added) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpClusterStatusAction.java Tue Oct 1 22:02:01 2013 @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.chaos.actions; + +import java.io.IOException; + +/** + * Action to dump the cluster status. 
+ */ +public class DumpClusterStatusAction extends Action { + + @Override + public void init(ActionContext context) throws IOException { + super.init(context); + } + + @Override + public void perform() throws Exception { + LOG.debug("Performing action: Dump cluster status"); + LOG.info("Cluster status\n" + cluster.getClusterStatus()); + } +} Propchange: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpClusterStatusAction.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FlushRandomRegionOfTableAction.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FlushRandomRegionOfTableAction.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FlushRandomRegionOfTableAction.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/FlushRandomRegionOfTableAction.java Tue Oct 1 22:02:01 2013 @@ -51,8 +51,13 @@ public class FlushRandomRegionOfTableAct LOG.info("Performing action: Flush random region of table " + tableName); List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes); + if (regions == null || regions.isEmpty()) { + LOG.info("Table " + tableName + " doesn't have regions to flush"); + return; + } + HRegionInfo region = PolicyBasedChaosMonkey.selectRandomItem( - regions.toArray(new HRegionInfo[regions.size()])); + regions.toArray(new HRegionInfo[regions.size()])); LOG.debug("Flushing region " + region.getRegionNameAsString()); admin.flush(region.getRegionName()); if (sleepTime > 0) { Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MergeRandomAdjacentRegionsOfTableAction.java URL: 
http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MergeRandomAdjacentRegionsOfTableAction.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MergeRandomAdjacentRegionsOfTableAction.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MergeRandomAdjacentRegionsOfTableAction.java Tue Oct 1 22:02:01 2013 @@ -51,8 +51,8 @@ public class MergeRandomAdjacentRegionsO LOG.info("Performing action: Merge random adjacent regions of table " + tableName); List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes); - if (regions.size() < 2) { - LOG.info("Table " + tableName + " doesn't have enough region to merge"); + if (regions == null || regions.size() < 2) { + LOG.info("Table " + tableName + " doesn't have enough regions to merge"); return; } Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRandomRegionOfTableAction.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRandomRegionOfTableAction.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRandomRegionOfTableAction.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRandomRegionOfTableAction.java Tue Oct 1 22:02:01 2013 @@ -55,8 +55,13 @@ public class MoveRandomRegionOfTableActi LOG.info("Performing action: Move random region of table " + tableName); List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes); + if (regions == null || regions.isEmpty()) { + LOG.info("Table " + tableName + " doesn't have regions to move"); + return; + } + HRegionInfo region = 
PolicyBasedChaosMonkey.selectRandomItem( - regions.toArray(new HRegionInfo[regions.size()])); + regions.toArray(new HRegionInfo[regions.size()])); LOG.debug("Unassigning region " + region.getRegionNameAsString()); admin.unassign(region.getRegionName(), false); if (sleepTime > 0) { Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java Tue Oct 1 22:02:01 2013 @@ -52,12 +52,16 @@ public class MoveRegionsOfTableAction ex } HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin(); - - List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes); Collection<ServerName> serversList = admin.getClusterStatus().getServers(); ServerName[] servers = serversList.toArray(new ServerName[serversList.size()]); LOG.info("Performing action: Move regions of table " + tableName); + List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes); + if (regions == null || regions.isEmpty()) { + LOG.info("Table " + tableName + " doesn't have regions to move"); + return; + } + for (HRegionInfo regionInfo:regions) { try { String destServerName = Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- 
hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java (original) +++ hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/SlowDeterministicMonkeyFactory.java Tue Oct 1 22:02:01 2013 @@ -25,6 +25,7 @@ import org.apache.hadoop.hbase.chaos.act import org.apache.hadoop.hbase.chaos.actions.ChangeVersionsAction; import org.apache.hadoop.hbase.chaos.actions.CompactRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.CompactTableAction; +import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction; import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction; import org.apache.hadoop.hbase.chaos.actions.FlushTableAction; import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction; @@ -81,11 +82,17 @@ public class SlowDeterministicMonkeyFact new RestartRsHoldingMetaAction(35000), }; + // Action to log more info for debugging + Action[] actions4 = new Action[] { + new DumpClusterStatusAction() + }; + return new PolicyBasedChaosMonkey(util, new PeriodicRandomActionPolicy(60 * 1000, actions1), new PeriodicRandomActionPolicy(90 * 1000, actions2), new CompositeSequentialPolicy( new DoActionsOncePolicy(150 * 1000, actions3), - new PeriodicRandomActionPolicy(150 * 1000, actions3))); + new PeriodicRandomActionPolicy(150 * 1000, actions3)), + new PeriodicRandomActionPolicy(90 * 1000, actions4)); } } Modified: hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java (original) +++ 
hbase/trunk/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java Tue Oct 1 22:02:01 2013 @@ -45,6 +45,7 @@ import org.apache.hadoop.hbase.Integrati import org.apache.hadoop.hbase.IntegrationTestingUtility; import org.apache.hadoop.hbase.IntegrationTests; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.chaos.monkies.CalmChaosMonkey; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HTable; @@ -59,6 +60,7 @@ import org.apache.hadoop.hbase.mapreduce import org.apache.hadoop.hbase.mapreduce.TableRecordReaderImpl; import org.apache.hadoop.hbase.util.AbstractHBaseTool; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.HBaseFsck; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; @@ -700,6 +702,8 @@ public class IntegrationTestBigLinkedLis private static final Log LOG = LogFactory.getLog(Loop.class); + IntegrationTestBigLinkedList it; + protected void runGenerator(int numMappers, long numNodes, String outputDir, Integer width, Integer wrapMuplitplier) throws Exception { Path outputPath = new Path(outputDir); @@ -714,7 +718,8 @@ public class IntegrationTestBigLinkedLis } } - protected void runVerify(String outputDir, int numReducers, long expectedNumNodes) throws Exception { + protected boolean runVerify(String outputDir, + int numReducers, long expectedNumNodes) throws Exception { Path outputPath = new Path(outputDir); UUID uuid = UUID.randomUUID(); //create a random UUID. 
Path iterationOutput = new Path(outputPath, uuid.toString()); @@ -726,12 +731,20 @@ public class IntegrationTestBigLinkedLis throw new RuntimeException("Verify.run failed with return code: " + retCode); } - boolean verifySuccess = verify.verify(expectedNumNodes); - if (!verifySuccess) { - throw new RuntimeException("Verify.verify failed"); + if (!verify.verify(expectedNumNodes)) { + try { + HBaseFsck fsck = new HBaseFsck(getConf()); + HBaseFsck.setDisplayFullReport(); + fsck.connect(); + fsck.onlineHbck(); + } catch (Throwable t) { + LOG.error("Failed to run hbck", t); + } + return false; } LOG.info("Verify finished with succees. Total nodes=" + expectedNumNodes); + return true; } @Override @@ -761,7 +774,17 @@ public class IntegrationTestBigLinkedLis runGenerator(numMappers, numNodes, outputDir, width, wrapMuplitplier); expectedNumNodes += numMappers * numNodes; - runVerify(outputDir, numReducers, expectedNumNodes); + if (!runVerify(outputDir, numReducers, expectedNumNodes)) { + if (it.monkey != null && !(it.monkey instanceof CalmChaosMonkey)) { + LOG.info("Verify.verify failed, let's stop CM and verify again"); + it.cleanUpMonkey("Stop monkey before verify again after verify failed"); + if (!runVerify(outputDir, numReducers, expectedNumNodes)) { + LOG.info("Verify.verify failed even without CM, verify one more"); + runVerify(outputDir, numReducers, expectedNumNodes); + } + } + throw new RuntimeException("Verify.verify failed"); + } } return 0; @@ -1051,7 +1074,9 @@ public class IntegrationTestBigLinkedLis } else if (toRun.equals("Verify")) { tool = new Verify(); } else if (toRun.equals("Loop")) { - tool = new Loop(); + Loop loop = new Loop(); + loop.it = this; + tool = loop; } else if (toRun.equals("Walker")) { tool = new Walker(); } else if (toRun.equals("Print")) { Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java URL: 
http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original) +++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Tue Oct 1 22:02:01 2013 @@ -269,8 +269,9 @@ public class AssignmentManager extends Z this.timerUpdater = null; } this.zkTable = new ZKTable(this.watcher); - this.maximumAttempts = - this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10); + // This is the max attempts, not retries, so it should be at least 1. + this.maximumAttempts = Math.max(1, + this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10)); this.sleepTimeBeforeRetryingMetaAssignment = this.server.getConfiguration().getLong( "hbase.meta.assignment.retry.sleeptime", 1000l); this.balancer = balancer; @@ -389,6 +390,14 @@ public class AssignmentManager extends Z } /** + * To avoid racing with AM, external entities may need to lock a region, + * for example, when SSH checks what regions to skip re-assigning. + */ + public Lock acquireRegionLock(final String encodedName) { + return locker.acquireLock(encodedName); + } + + /** * Now, failover cleanup is completed. Notify server manager to * process queued up dead servers processing, if any. */ @@ -586,10 +595,7 @@ public class AssignmentManager extends Z } else { // Insert into RIT & resend the query to the region server: may be the previous master // died before sending the query the first time. 
- regionStates.updateRegionState(rt, RegionState.State.CLOSING); - final RegionState rs = regionStates.getRegionState(regionInfo); - final ClosedRegionHandler closedRegionHandler = - new ClosedRegionHandler(server, this, regionInfo); + final RegionState rs = regionStates.updateRegionState(rt, State.CLOSING); this.executorService.submit( new EventHandler(server, EventType.M_MASTER_RECOVERY) { @Override @@ -598,7 +604,7 @@ public class AssignmentManager extends Z try { unassign(regionInfo, rs, expectedVersion, null, true, null); if (regionStates.isRegionOffline(regionInfo)) { - closedRegionHandler.process(); + assign(regionInfo, true); } } finally { lock.unlock(); @@ -611,18 +617,18 @@ public class AssignmentManager extends Z case RS_ZK_REGION_CLOSED: case RS_ZK_REGION_FAILED_OPEN: // Region is closed, insert into RIT and handle it - addToRITandCallClose(regionInfo, RegionState.State.CLOSED, rt); + addToRITandInvokeAssign(regionInfo, State.CLOSED, rt); break; case M_ZK_REGION_OFFLINE: // If zk node of the region was updated by a live server skip this // region and just add it into RIT. 
if (!serverManager.isServerOnline(sn)) { - // Region is offline, insert into RIT and handle it like a closed - addToRITandCallClose(regionInfo, RegionState.State.OFFLINE, rt); + // Region is offline, insert into RIT and invoke assign + addToRITandInvokeAssign(regionInfo, State.OFFLINE, rt); } else { // Insert in RIT and resend to the regionserver - regionStates.updateRegionState(rt, RegionState.State.PENDING_OPEN); + regionStates.updateRegionState(rt, State.PENDING_OPEN); final RegionState rs = regionStates.getRegionState(regionInfo); this.executorService.submit( new EventHandler(server, EventType.M_MASTER_RECOVERY) { @@ -645,7 +651,7 @@ public class AssignmentManager extends Z if (!serverManager.isServerOnline(sn)) { forceOffline(regionInfo, rt); } else { - regionStates.updateRegionState(rt, RegionState.State.OPENING); + regionStates.updateRegionState(rt, State.OPENING); } break; @@ -656,7 +662,7 @@ public class AssignmentManager extends Z // Region is opened, insert into RIT and handle it // This could be done asynchronously, we would need then to acquire the lock in the // handler. - regionStates.updateRegionState(rt, RegionState.State.OPEN); + regionStates.updateRegionState(rt, State.OPEN); new OpenedRegionHandler(server, this, regionInfo, sn, expectedVersion).process(); } break; @@ -672,7 +678,7 @@ public class AssignmentManager extends Z // user region rebuilding since we may consider the split is completed. // Put it in SPLITTING state to avoid complications. regionStates.regionOnline(regionInfo, sn); - regionStates.updateRegionState(rt, RegionState.State.SPLITTING); + regionStates.updateRegionState(rt, State.SPLITTING); LOG.info("Processed " + prettyPrintedRegionName + " in state : " + et); } break; @@ -687,7 +693,7 @@ public class AssignmentManager extends Z // user region rebuilding since we may consider the split is completed. // Put it in SPLITTING state to avoid complications. 
regionStates.regionOnline(regionInfo, sn); - regionStates.updateRegionState(rt, RegionState.State.SPLITTING); + regionStates.updateRegionState(rt, State.SPLITTING); LOG.info("Processed " + prettyPrintedRegionName + " in state : " + et); // Move the region to splitting state. The regionserver is supposed to update the znode // multiple times so if it's still up we will receive an update soon. @@ -741,24 +747,24 @@ public class AssignmentManager extends Z private void forceOffline(final HRegionInfo hri, final RegionTransition oldRt) throws KeeperException { // If was on dead server, its closed now. Force to OFFLINE and then - // handle it like a close; this will get it reassigned if appropriate + // invoke assign; this will get it reassigned if appropriate LOG.debug("RIT " + hri.getEncodedName() + " in state=" + oldRt.getEventType() + " was on deadserver; forcing offline"); ZKAssign.createOrForceNodeOffline(this.watcher, hri, oldRt.getServerName()); - addToRITandCallClose(hri, RegionState.State.OFFLINE, oldRt); + addToRITandInvokeAssign(hri, State.OFFLINE, oldRt); } /** - * Add to the in-memory copy of regions in transition and then call close - * handler on passed region hri + * Add to the in-memory copy of regions in transition and then invoke + * assign on passed region hri * @param hri * @param state * @param oldData */ - private void addToRITandCallClose(final HRegionInfo hri, - final RegionState.State state, final RegionTransition oldData) { + private void addToRITandInvokeAssign(final HRegionInfo hri, + final State state, final RegionTransition oldData) { regionStates.updateRegionState(oldData, state); - new ClosedRegionHandler(this.server, this, hri).process(); + invokeAssign(hri); } /** @@ -800,7 +806,7 @@ public class AssignmentManager extends Z if (!serverManager.isServerOnline(sn) && !ignoreStatesRSOffline.contains(rt.getEventType())) { LOG.warn("Attempted to handle region transition for server but " + - "server is not online: " + 
prettyPrintedRegionName); + "it is not online: " + prettyPrintedRegionName + ", " + rt); return; } @@ -839,119 +845,114 @@ public class AssignmentManager extends Z } regionState = latestState; switch (rt.getEventType()) { - case RS_ZK_REGION_SPLITTING: - if (!isInStateForSplitting(regionState)) break; - regionStates.updateRegionState(rt, RegionState.State.SPLITTING); - break; + case RS_ZK_REGION_SPLITTING: + if (!isInStateForSplitting(regionState)) break; + regionStates.updateRegionState(rt, State.SPLITTING); + break; - case RS_ZK_REGION_SPLIT: - // RegionState must be null, or SPLITTING or PENDING_CLOSE. - if (!isInStateForSplitting(regionState)) break; - // If null, add SPLITTING state before going to SPLIT + case RS_ZK_REGION_SPLIT: + // RegionState must be null, or SPLITTING or PENDING_CLOSE. + if (!isInStateForSplitting(regionState)) break; + // If null, add SPLITTING state before going to SPLIT + if (regionState == null) { + regionState = regionStates.updateRegionState(rt, State.SPLITTING); + + String message = "Received SPLIT for region " + prettyPrintedRegionName + + " from server " + sn; + // If still null, it means we cannot find it and it was already processed if (regionState == null) { - regionState = regionStates.updateRegionState(rt, - RegionState.State.SPLITTING); - - String message = "Received SPLIT for region " + prettyPrintedRegionName + - " from server " + sn; - // If still null, it means we cannot find it and it was already processed - if (regionState == null) { - LOG.warn(message + " but it doesn't exist anymore," + - " probably already processed its split"); - break; - } - LOG.info(message + - " but region was not first in SPLITTING state; continuing"); - } - // Check it has daughters. - byte [] payload = rt.getPayload(); - List daughters; - try { - daughters = HRegionInfo.parseDelimitedFrom(payload, 0, payload.length); - } catch (IOException e) { - LOG.error("Dropped split! 
Failed reading split payload for " + - prettyPrintedRegionName); + LOG.warn(message + " but it doesn't exist anymore," + + " probably already processed its split"); break; } - assert daughters.size() == 2; - // Assert that we can get a serverinfo for this server. - if (!this.serverManager.isServerOnline(sn)) { - LOG.error("Dropped split! ServerName=" + sn + " unknown."); - break; - } - // Run handler to do the rest of the SPLIT handling. - new SplitRegionHandler(server, this, regionState.getRegion(), sn, daughters).process(); - updateSplitHandlerTracker(); + LOG.info(message + + " but region was not first in SPLITTING state; continuing"); + } + // Check it has daughters. + byte [] payload = rt.getPayload(); + List daughters; + try { + daughters = HRegionInfo.parseDelimitedFrom(payload, 0, payload.length); + } catch (IOException e) { + LOG.error("Dropped split! Failed reading split payload for " + + prettyPrintedRegionName); break; - - case RS_ZK_REGION_MERGING: - // Merged region is a new region, we can't find it in the region states now. - // However, the two merging regions are not new. They should be in state for merging. - handleRegionMerging(rt, prettyPrintedRegionName, sn); + } + assert daughters.size() == 2; + // Assert that we can get a serverinfo for this server. + if (!this.serverManager.isServerOnline(sn)) { + LOG.error("Dropped split! ServerName=" + sn + " unknown."); break; + } + // Run handler to do the rest of the SPLIT handling. + new SplitRegionHandler(server, this, regionState.getRegion(), sn, daughters).process(); + updateSplitHandlerTracker(); + break; - case RS_ZK_REGION_MERGED: - // Assert that we can get a serverinfo for this server. - if (!this.serverManager.isServerOnline(sn)) { - LOG.error("Dropped merge! ServerName=" + sn + " unknown."); - break; - } - // Get merged and merging regions. 
- byte[] payloadOfMerge = rt.getPayload(); - List mergeRegions; - try { - mergeRegions = HRegionInfo.parseDelimitedFrom(payloadOfMerge, 0, - payloadOfMerge.length); - } catch (IOException e) { - LOG.error("Dropped merge! Failed reading merge payload for " + - prettyPrintedRegionName); - break; - } - assert mergeRegions.size() == 3; - HRegionInfo merge_a = mergeRegions.get(1); - HRegionInfo merge_b = mergeRegions.get(2); - if (!isInStateForMerging(sn, merge_a, merge_b)) { - // Move on. Merge already happened (passed PONR), no point to stop now - LOG.warn("Got merge event, but not in state good for MERGED; rs_a=" - + merge_a + ", rs_b=" + merge_b); - } - // Run handler to do the rest of the MERGED handling. - new MergedRegionHandler(server, this, sn, mergeRegions).process(); - break; + case RS_ZK_REGION_MERGING: + // Merged region is a new region, we can't find it in the region states now. + // However, the two merging regions are not new. They should be in state for merging. + handleRegionMerging(rt, prettyPrintedRegionName, sn); + break; - case M_ZK_REGION_CLOSING: - // Should see CLOSING after we have asked it to CLOSE or additional - // times after already being in state of CLOSING - if (regionState == null - || !regionState.isPendingCloseOrClosingOnServer(sn)) { - LOG.warn("Received CLOSING for " + prettyPrintedRegionName - + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: " - + regionStates.getRegionState(encodedName)); - return; - } - // Transition to CLOSING (or update stamp if already CLOSING) - regionStates.updateRegionState(rt, RegionState.State.CLOSING); + case RS_ZK_REGION_MERGED: + // Assert that we can get a serverinfo for this server. + if (!this.serverManager.isServerOnline(sn)) { + LOG.error("Dropped merge! 
ServerName=" + sn + " unknown."); break; - - case RS_ZK_REGION_CLOSED: - // Should see CLOSED after CLOSING but possible after PENDING_CLOSE - if (regionState == null - || !regionState.isPendingCloseOrClosingOnServer(sn)) { - LOG.warn("Received CLOSED for " + prettyPrintedRegionName - + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: " - + regionStates.getRegionState(encodedName)); - return; - } - // Handle CLOSED by assigning elsewhere or stopping if a disable - // If we got here all is good. Need to update RegionState -- else - // what follows will fail because not in expected state. - regionState = regionStates.updateRegionState(rt, RegionState.State.CLOSED); - if (regionState != null) { - removeClosedRegion(regionState.getRegion()); - new ClosedRegionHandler(server, this, regionState.getRegion()).process(); - updateClosedRegionHandlerTracker(regionState.getRegion()); - } + } + // Get merged and merging regions. + byte[] payloadOfMerge = rt.getPayload(); + List mergeRegions; + try { + mergeRegions = HRegionInfo.parseDelimitedFrom(payloadOfMerge, 0, + payloadOfMerge.length); + } catch (IOException e) { + LOG.error("Dropped merge! Failed reading merge payload for " + + prettyPrintedRegionName); break; + } + assert mergeRegions.size() == 3; + HRegionInfo merge_a = mergeRegions.get(1); + HRegionInfo merge_b = mergeRegions.get(2); + if (!isInStateForMerging(sn, merge_a, merge_b)) { + // Move on. Merge already happened (passed PONR), no point to stop now + LOG.warn("Got merge event, but not in state good for MERGED; rs_a=" + + merge_a + ", rs_b=" + merge_b); + } + // Run handler to do the rest of the MERGED handling. 
+ new MergedRegionHandler(server, this, sn, mergeRegions).process(); + break; + + case M_ZK_REGION_CLOSING: + // Should see CLOSING after we have asked it to CLOSE or additional + // times after already being in state of CLOSING + if (regionState == null + || !regionState.isPendingCloseOrClosingOnServer(sn)) { + LOG.warn("Received CLOSING for " + prettyPrintedRegionName + + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: " + + regionStates.getRegionState(encodedName)); + return; + } + // Transition to CLOSING (or update stamp if already CLOSING) + regionStates.updateRegionState(rt, State.CLOSING); + break; + + case RS_ZK_REGION_CLOSED: + // Should see CLOSED after CLOSING but possible after PENDING_CLOSE + if (regionState == null + || !regionState.isPendingCloseOrClosingOnServer(sn)) { + LOG.warn("Received CLOSED for " + prettyPrintedRegionName + + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: " + + regionStates.getRegionState(encodedName)); + return; + } + // Handle CLOSED by assigning elsewhere or stopping if a disable + // If we got here all is good. Need to update RegionState -- else + // what follows will fail because not in expected state. + new ClosedRegionHandler(server, this, regionState.getRegion()).process(); + updateClosedRegionHandlerTracker(regionState.getRegion()); + break; case RS_ZK_REGION_FAILED_OPEN: if (regionState == null @@ -970,13 +971,13 @@ public class AssignmentManager extends Z failedOpenTracker.put(encodedName, failedOpenCount); } if (failedOpenCount.incrementAndGet() >= maximumAttempts) { - regionStates.updateRegionState(rt, RegionState.State.FAILED_OPEN); + regionStates.updateRegionState(rt, State.FAILED_OPEN); // remove the tracking info to save memory, also reset // the count for next open initiative failedOpenTracker.remove(encodedName); } else { // Handle this the same as if it were opened and then closed. 
- regionState = regionStates.updateRegionState(rt, RegionState.State.CLOSED); + regionState = regionStates.updateRegionState(rt, State.CLOSED); if (regionState != null) { // When there are more than one region server a new RS is selected as the // destination and the same is updated in the regionplan. (HBASE-5546) @@ -1001,7 +1002,7 @@ public class AssignmentManager extends Z return; } // Transition to OPENING (or update stamp if already OPENING) - regionStates.updateRegionState(rt, RegionState.State.OPENING); + regionStates.updateRegionState(rt, State.OPENING); break; case RS_ZK_REGION_OPENED: @@ -1019,7 +1020,7 @@ public class AssignmentManager extends Z return; } // Handle OPENED by removing from transition and deleted zk node - regionState = regionStates.updateRegionState(rt, RegionState.State.OPEN); + regionState = regionStates.updateRegionState(rt, State.OPEN); if (regionState != null) { failedOpenTracker.remove(encodedName); // reset the count, if any new OpenedRegionHandler( @@ -1141,8 +1142,7 @@ public class AssignmentManager extends Z private boolean convertPendingCloseToSplitting(final RegionState rs) { if (!rs.isPendingClose()) return false; LOG.debug("Converting PENDING_CLOSE to SPLITTING; rs=" + rs); - regionStates.updateRegionState( - rs.getRegion(), RegionState.State.SPLITTING); + regionStates.updateRegionState(rs.getRegion(), State.SPLITTING); // Clean up existing state. Clear from region plans seems all we // have to do here by way of clean up of PENDING_CLOSE. 
clearRegionPlan(rs.getRegion()); @@ -1400,10 +1400,6 @@ public class AssignmentManager extends Z * @param sn */ void regionOnline(HRegionInfo regionInfo, ServerName sn) { - if (!serverManager.isServerOnline(sn)) { - LOG.warn("A region was opened on a dead server, ServerName=" + - sn + ", region=" + regionInfo.getEncodedName()); - } numRegionsOpened.incrementAndGet(); regionStates.regionOnline(regionInfo, sn); @@ -1556,7 +1552,7 @@ public class AssignmentManager extends Z */ public void assign(HRegionInfo region, boolean setOfflineInZK, boolean forceNewPlan) { - if (!setOfflineInZK && isDisabledorDisablingRegionInRIT(region)) { + if (isDisabledorDisablingRegionInRIT(region)) { return; } if (this.serverManager.isClusterShutdown()) { @@ -1569,6 +1565,12 @@ public class AssignmentManager extends Z try { RegionState state = forceRegionStateToOffline(region, forceNewPlan); if (state != null) { + if (regionStates.wasRegionOnDeadServer(encodedName)) { + LOG.info("Skip assigning " + region.getRegionNameAsString() + + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName) + + " is dead but not processed yet"); + return; + } assign(state, setOfflineInZK, forceNewPlan); } } finally { @@ -1605,19 +1607,34 @@ public class AssignmentManager extends Z Map plans = new HashMap(regions.size()); List states = new ArrayList(regions.size()); for (HRegionInfo region : regions) { - String encodedRegionName = region.getEncodedName(); - RegionState state = forceRegionStateToOffline(region, true); - if (state != null && asyncSetOfflineInZooKeeper(state, cb, destination)) { - RegionPlan plan = new RegionPlan(region, state.getServerName(), destination); - plans.put(encodedRegionName, plan); - states.add(state); - } else { - LOG.warn("failed to force region state to offline or " - + "failed to set it offline in ZK, will reassign later: " + region); - failedToOpenRegions.add(region); // assign individually later - Lock lock = locks.remove(encodedRegionName); - lock.unlock(); + 
String encodedName = region.getEncodedName(); + if (!isDisabledorDisablingRegionInRIT(region)) { + RegionState state = forceRegionStateToOffline(region, false); + boolean onDeadServer = false; + if (state != null) { + if (regionStates.wasRegionOnDeadServer(encodedName)) { + LOG.info("Skip assigning " + region.getRegionNameAsString() + + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName) + + " is dead but not processed yet"); + onDeadServer = true; + } else if (asyncSetOfflineInZooKeeper(state, cb, destination)) { + RegionPlan plan = new RegionPlan(region, state.getServerName(), destination); + plans.put(encodedName, plan); + states.add(state); + continue; + } + } + // Reassign if the region wasn't on a dead server + if (!onDeadServer) { + LOG.info("failed to force region state to offline or " + + "failed to set it offline in ZK, will reassign later: " + region); + failedToOpenRegions.add(region); // assign individually later + } } + // Release the lock, this region is excluded from bulk assign because + // we can't update its state, or set its znode to offline. + Lock lock = locks.remove(encodedName); + lock.unlock(); } // Wait until all unassigned nodes have been put up and watchers set. @@ -1653,8 +1670,8 @@ public class AssignmentManager extends Z Lock lock = locks.remove(encodedRegionName); lock.unlock(); } else { - regionStates.updateRegionState(region, - RegionState.State.PENDING_OPEN, destination); + regionStates.updateRegionState( + region, State.PENDING_OPEN, destination); List favoredNodes = ServerName.EMPTY_SERVER_LIST; if (this.shouldAssignRegionsWithFavoredNodes) { favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region); @@ -1666,8 +1683,8 @@ public class AssignmentManager extends Z // Move on to open regions. try { - // Send OPEN RPC. If it fails on a IOE or RemoteException, the - // TimeoutMonitor will pick up the pieces. + // Send OPEN RPC. 
If it fails on a IOE or RemoteException, + // regions will be assigned individually. long maxWaitTime = System.currentTimeMillis() + this.server.getConfiguration(). getLong("hbase.regionserver.rpc.startup.waittime", 60000); @@ -1730,8 +1747,8 @@ public class AssignmentManager extends Z } } catch (IOException e) { // Can be a socket timeout, EOF, NoRouteToHost, etc - LOG.info("Unable to communicate with the region server in order" + - " to assign regions", e); + LOG.info("Unable to communicate with " + destination + + " in order to assign regions, ", e); return false; } catch (InterruptedException e) { throw new RuntimeException(e); @@ -1744,10 +1761,12 @@ public class AssignmentManager extends Z if (!failedToOpenRegions.isEmpty()) { for (HRegionInfo region : failedToOpenRegions) { - invokeAssign(region); + if (!regionStates.isRegionOnline(region)) { + invokeAssign(region); + } } } - LOG.debug("Bulk assigning done for " + destination.toString()); + LOG.debug("Bulk assigning done for " + destination); return true; } finally { metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTimeMillis() - startTime); @@ -1823,7 +1842,7 @@ public class AssignmentManager extends Z } // Run out of attempts if (!tomActivated && state != null) { - regionStates.updateRegionState(region, RegionState.State.FAILED_CLOSE); + regionStates.updateRegionState(region, State.FAILED_CLOSE); } } @@ -1838,36 +1857,77 @@ public class AssignmentManager extends Z state = regionStates.createRegionState(region); } + if (forceNewPlan && LOG.isDebugEnabled()) { + LOG.debug("Force region state offline " + state); + } + switch (state.getState()) { case OPEN: case OPENING: case PENDING_OPEN: + case CLOSING: + case PENDING_CLOSE: if (!forceNewPlan) { - LOG.debug("Attempting to assign region " + - region + " but it is already in transition: " + state); + LOG.debug("Skip assigning " + + region + ", it is already " + state); return null; } - case CLOSING: - case PENDING_CLOSE: case 
FAILED_CLOSE: case FAILED_OPEN: unassign(region, state, -1, null, false, null); + RegionState oldState = state; state = regionStates.getRegionState(region); - if (state.isOffline()) break; + if (state.isFailedClose()) { + LOG.info("Skip assigning " + + region + ", we couldn't close it: " + state); + return null; + } + // In these cases, we need to confirm with meta + // the region was not on a dead server if it's open/pending. + if ((oldState.isOpened() || oldState.isPendingOpenOrOpening()) + && wasRegionOnDeadServerByMeta(region, oldState.getServerName())) { + LOG.info("Skip assigning " + region.getRegionNameAsString() + + ", it is on a dead but not processed yet server"); + return null; + } case CLOSED: - LOG.debug("Forcing OFFLINE; was=" + state); - state = regionStates.updateRegionState( - region, RegionState.State.OFFLINE); case OFFLINE: break; default: LOG.error("Trying to assign region " + region - + ", which is in state " + state); + + ", which is " + state); return null; } return state; } + private boolean wasRegionOnDeadServerByMeta( + final HRegionInfo region, final ServerName sn) { + try { + if (region.isMetaRegion()) { + ServerName server = catalogTracker.getMetaLocation(); + return regionStates.isServerDeadAndNotProcessed(server); + } + while (!server.isStopped()) { + try { + catalogTracker.waitForMeta(); + Pair r = + MetaReader.getRegion(catalogTracker, region.getRegionName()); + ServerName server = r == null ? null : r.getSecond(); + return regionStates.isServerDeadAndNotProcessed(server); + } catch (IOException ioe) { + LOG.info("Received exception accessing hbase:meta during force assign " + + region.getRegionNameAsString() + ", retrying", ioe); + } + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + LOG.info("Interrupted accessing hbase:meta", e); + } + // Call is interrupted or server is stopped. + return regionStates.isServerDeadAndNotProcessed(sn); + } + /** * Caller must hold lock on the passed state object. 
* @param state @@ -1878,214 +1938,214 @@ public class AssignmentManager extends Z final boolean setOfflineInZK, final boolean forceNewPlan) { long startTime = EnvironmentEdgeManager.currentTimeMillis(); try { - RegionState currentState = state; - int versionOfOfflineNode = -1; - RegionPlan plan = null; - long maxRegionServerStartupWaitTime = -1; - HRegionInfo region = state.getRegion(); - RegionOpeningState regionOpenState; - for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) { - if (plan == null) { // Get a server for the region at first - try { - plan = getRegionPlan(region, forceNewPlan); - } catch (HBaseIOException e) { - LOG.warn("Failed to get region plan", e); + RegionState currentState = state; + int versionOfOfflineNode = -1; + RegionPlan plan = null; + long maxRegionServerStartupWaitTime = -1; + HRegionInfo region = state.getRegion(); + RegionOpeningState regionOpenState; + for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) { + if (plan == null) { // Get a server for the region at first + try { + plan = getRegionPlan(region, forceNewPlan); + } catch (HBaseIOException e) { + LOG.warn("Failed to get region plan", e); + } } - } - if (plan == null) { - LOG.warn("Unable to determine a plan to assign " + region); - if (tomActivated){ - this.timeoutMonitor.setAllRegionServersOffline(true); - } else { - if (region.isMetaRegion()) { - try { - if (i != maximumAttempts) { - Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment); - continue; + if (plan == null) { + LOG.warn("Unable to determine a plan to assign " + region); + if (tomActivated){ + this.timeoutMonitor.setAllRegionServersOffline(true); + } else { + if (region.isMetaRegion()) { + try { + if (i != maximumAttempts) { + Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment); + continue; + } + // TODO : Ensure HBCK fixes this + LOG.error("Unable to determine a plan to assign hbase:meta even after repeated attempts. 
Run HBCK to fix this"); + } catch (InterruptedException e) { + LOG.error("Got exception while waiting for hbase:meta assignment"); + Thread.currentThread().interrupt(); } - // TODO : Ensure HBCK fixes this - LOG.error("Unable to determine a plan to assign hbase:meta even after repeated attempts. Run HBCK to fix this"); - } catch (InterruptedException e) { - LOG.error("Got exception while waiting for hbase:meta assignment"); - Thread.currentThread().interrupt(); } + regionStates.updateRegionState(region, State.FAILED_OPEN); } - regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN); + return; } - return; - } - if (setOfflineInZK && versionOfOfflineNode == -1) { - // get the version of the znode after setting it to OFFLINE. - // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE - versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination()); - if (versionOfOfflineNode != -1) { - if (isDisabledorDisablingRegionInRIT(region)) { - return; + if (setOfflineInZK && versionOfOfflineNode == -1) { + // get the version of the znode after setting it to OFFLINE. + // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE + versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination()); + if (versionOfOfflineNode != -1) { + if (isDisabledorDisablingRegionInRIT(region)) { + return; + } + // In case of assignment from EnableTableHandler table state is ENABLING. Any how + // EnableTableHandler will set ENABLED after assigning all the table regions. If we + // try to set to ENABLED directly then client API may think table is enabled. + // When we have a case such as all the regions are added directly into hbase:meta and we call + // assignRegion then we need to make the table ENABLED. Hence in such case the table + // will not be in ENABLING or ENABLED state. 
+ TableName tableName = region.getTable(); + if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) { + LOG.debug("Setting table " + tableName + " to ENABLED state."); + setEnabledTable(tableName); + } } - // In case of assignment from EnableTableHandler table state is ENABLING. Any how - // EnableTableHandler will set ENABLED after assigning all the table regions. If we - // try to set to ENABLED directly then client API may think table is enabled. - // When we have a case such as all the regions are added directly into hbase:meta and we call - // assignRegion then we need to make the table ENABLED. Hence in such case the table - // will not be in ENABLING or ENABLED state. - TableName tableName = region.getTable(); - if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) { - LOG.debug("Setting table " + tableName + " to ENABLED state."); - setEnabledTable(tableName); + } + if (setOfflineInZK && versionOfOfflineNode == -1) { + LOG.info("Unable to set offline in ZooKeeper to assign " + region); + // Setting offline in ZK must have been failed due to ZK racing or some + // exception which may make the server to abort. If it is ZK racing, + // we should retry since we already reset the region state, + // existing (re)assignment will fail anyway. + if (!server.isAborted()) { + continue; } } - } - if (setOfflineInZK && versionOfOfflineNode == -1) { - LOG.info("Unable to set offline in ZooKeeper to assign " + region); - // Setting offline in ZK must have been failed due to ZK racing or some - // exception which may make the server to abort. If it is ZK racing, - // we should retry since we already reset the region state, - // existing (re)assignment will fail anyway. 
- if (!server.isAborted()) { - continue; + if (this.server.isStopped() || this.server.isAborted()) { + LOG.debug("Server stopped/aborted; skipping assign of " + region); + return; } - } - if (this.server.isStopped() || this.server.isAborted()) { - LOG.debug("Server stopped/aborted; skipping assign of " + region); - return; - } - LOG.info("Assigning " + region.getRegionNameAsString() + - " to " + plan.getDestination().toString()); - // Transition RegionState to PENDING_OPEN - currentState = regionStates.updateRegionState(region, - RegionState.State.PENDING_OPEN, plan.getDestination()); - - boolean needNewPlan; - final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() + - " to " + plan.getDestination(); - try { - List favoredNodes = ServerName.EMPTY_SERVER_LIST; - if (this.shouldAssignRegionsWithFavoredNodes) { - favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region); - } - regionOpenState = serverManager.sendRegionOpen( - plan.getDestination(), region, versionOfOfflineNode, favoredNodes); - - if (regionOpenState == RegionOpeningState.FAILED_OPENING) { - // Failed opening this region, looping again on a new server. 
- needNewPlan = true; - LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " + - " trying to assign elsewhere instead; " + - "try=" + i + " of " + this.maximumAttempts); - } else { - // we're done - if (regionOpenState == RegionOpeningState.ALREADY_OPENED) { - processAlreadyOpenedRegion(region, plan.getDestination()); + LOG.info("Assigning " + region.getRegionNameAsString() + + " to " + plan.getDestination().toString()); + // Transition RegionState to PENDING_OPEN + currentState = regionStates.updateRegionState(region, + State.PENDING_OPEN, plan.getDestination()); + + boolean needNewPlan; + final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() + + " to " + plan.getDestination(); + try { + List favoredNodes = ServerName.EMPTY_SERVER_LIST; + if (this.shouldAssignRegionsWithFavoredNodes) { + favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region); + } + regionOpenState = serverManager.sendRegionOpen( + plan.getDestination(), region, versionOfOfflineNode, favoredNodes); + + if (regionOpenState == RegionOpeningState.FAILED_OPENING) { + // Failed opening this region, looping again on a new server. + needNewPlan = true; + LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " + + " trying to assign elsewhere instead; " + + "try=" + i + " of " + this.maximumAttempts); + } else { + // we're done + if (regionOpenState == RegionOpeningState.ALREADY_OPENED) { + processAlreadyOpenedRegion(region, plan.getDestination()); + } + return; + } + + } catch (Throwable t) { + if (t instanceof RemoteException) { + t = ((RemoteException) t).unwrapRemoteException(); + } + + // Should we wait a little before retrying? If the server is starting it's yes. + // If the region is already in transition, it's yes as well: we want to be sure that + // the region will get opened but we don't want a double assignment. 
+ boolean hold = (t instanceof RegionAlreadyInTransitionException || + t instanceof ServerNotRunningYetException); + + // In case socket is timed out and the region server is still online, + // the openRegion RPC could have been accepted by the server and + // just the response didn't go through. So we will retry to + // open the region on the same server to avoid possible + // double assignment. + boolean retry = !hold && (t instanceof java.net.SocketTimeoutException + && this.serverManager.isServerOnline(plan.getDestination())); + + + if (hold) { + LOG.warn(assignMsg + ", waiting a little before trying on the same region server " + + "try=" + i + " of " + this.maximumAttempts, t); + + if (maxRegionServerStartupWaitTime < 0) { + maxRegionServerStartupWaitTime = EnvironmentEdgeManager.currentTimeMillis() + + this.server.getConfiguration(). + getLong("hbase.regionserver.rpc.startup.waittime", 60000); + } + try { + long now = EnvironmentEdgeManager.currentTimeMillis(); + if (now < maxRegionServerStartupWaitTime) { + LOG.debug("Server is not yet up; waiting up to " + + (maxRegionServerStartupWaitTime - now) + "ms", t); + Thread.sleep(100); + i--; // reset the try count + needNewPlan = false; + } else { + LOG.debug("Server is not up for a while; try a new one", t); + needNewPlan = true; + } + } catch (InterruptedException ie) { + LOG.warn("Failed to assign " + + region.getRegionNameAsString() + " since interrupted", ie); + Thread.currentThread().interrupt(); + if (!tomActivated) { + regionStates.updateRegionState(region, State.FAILED_OPEN); + } + return; + } + } else if (retry) { + needNewPlan = false; + LOG.warn(assignMsg + ", trying to assign to the same region server " + + "try=" + i + " of " + this.maximumAttempts, t); + } else { + needNewPlan = true; + LOG.warn(assignMsg + ", trying to assign elsewhere instead;" + + " try=" + i + " of " + this.maximumAttempts, t); } - return; } - } catch (Throwable t) { - if (t instanceof RemoteException) { - t = 
((RemoteException) t).unwrapRemoteException(); + if (i == this.maximumAttempts) { + // Don't reset the region state or get a new plan any more. + // This is the last try. + continue; } - // Should we wait a little before retrying? If the server is starting it's yes. - // If the region is already in transition, it's yes as well: we want to be sure that - // the region will get opened but we don't want a double assignment. - boolean hold = (t instanceof RegionAlreadyInTransitionException || - t instanceof ServerNotRunningYetException); - - // In case socket is timed out and the region server is still online, - // the openRegion RPC could have been accepted by the server and - // just the response didn't go through. So we will retry to - // open the region on the same server to avoid possible - // double assignment. - boolean retry = !hold && (t instanceof java.net.SocketTimeoutException - && this.serverManager.isServerOnline(plan.getDestination())); - - - if (hold) { - LOG.warn(assignMsg + ", waiting a little before trying on the same region server " + - "try=" + i + " of " + this.maximumAttempts, t); - - if (maxRegionServerStartupWaitTime < 0) { - maxRegionServerStartupWaitTime = EnvironmentEdgeManager.currentTimeMillis() + - this.server.getConfiguration(). - getLong("hbase.regionserver.rpc.startup.waittime", 60000); - } + // If region opened on destination of present plan, reassigning to new + // RS may cause double assignments. In case of RegionAlreadyInTransitionException + // reassigning to same RS. + if (needNewPlan) { + // Force a new plan and reassign. Will return null if no servers. + // The new plan could be the same as the existing plan since we don't + // exclude the server of the original plan, which should not be + // excluded since it could be the only server up now. 
+ RegionPlan newPlan = null; try { - long now = EnvironmentEdgeManager.currentTimeMillis(); - if (now < maxRegionServerStartupWaitTime) { - LOG.debug("Server is not yet up; waiting up to " + - (maxRegionServerStartupWaitTime - now) + "ms", t); - Thread.sleep(100); - i--; // reset the try count - needNewPlan = false; + newPlan = getRegionPlan(region, true); + } catch (HBaseIOException e) { + LOG.warn("Failed to get region plan", e); + } + if (newPlan == null) { + if (tomActivated) { + this.timeoutMonitor.setAllRegionServersOffline(true); } else { - LOG.debug("Server is not up for a while; try a new one", t); - needNewPlan = true; - } - } catch (InterruptedException ie) { - LOG.warn("Failed to assign " - + region.getRegionNameAsString() + " since interrupted", ie); - Thread.currentThread().interrupt(); - if (!tomActivated) { - regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN); + regionStates.updateRegionState(region, State.FAILED_OPEN); } + LOG.warn("Unable to find a viable location to assign region " + + region.getRegionNameAsString()); return; } - } else if (retry) { - needNewPlan = false; - LOG.warn(assignMsg + ", trying to assign to the same region server " + - "try=" + i + " of " + this.maximumAttempts, t); - } else { - needNewPlan = true; - LOG.warn(assignMsg + ", trying to assign elsewhere instead;" + - " try=" + i + " of " + this.maximumAttempts, t); - } - } - if (i == this.maximumAttempts) { - // Don't reset the region state or get a new plan any more. - // This is the last try. - continue; - } - - // If region opened on destination of present plan, reassigning to new - // RS may cause double assignments. In case of RegionAlreadyInTransitionException - // reassigning to same RS. - if (needNewPlan) { - // Force a new plan and reassign. Will return null if no servers. 
- // The new plan could be the same as the existing plan since we don't - // exclude the server of the original plan, which should not be - // excluded since it could be the only server up now. - RegionPlan newPlan = null; - try { - newPlan = getRegionPlan(region, true); - } catch (HBaseIOException e) { - LOG.warn("Failed to get region plan", e); - } - if (newPlan == null) { - if (tomActivated) { - this.timeoutMonitor.setAllRegionServersOffline(true); - } else { - regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN); + if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) { + // Clean out plan we failed execute and one that doesn't look like it'll + // succeed anyways; we need a new plan! + // Transition back to OFFLINE + currentState = regionStates.updateRegionState(region, State.OFFLINE); + versionOfOfflineNode = -1; + plan = newPlan; } - LOG.warn("Unable to find a viable location to assign region " + - region.getRegionNameAsString()); - return; - } - - if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) { - // Clean out plan we failed execute and one that doesn't look like it'll - // succeed anyways; we need a new plan! 
- // Transition back to OFFLINE - currentState = regionStates.updateRegionState(region, RegionState.State.OFFLINE); - versionOfOfflineNode = -1; - plan = newPlan; } } - } - // Run out of attempts - if (!tomActivated) { - regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN); - } + // Run out of attempts + if (!tomActivated) { + regionStates.updateRegionState(region, State.FAILED_OPEN); + } } finally { metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTimeMillis() - startTime); } @@ -2139,8 +2199,7 @@ public class AssignmentManager extends Z this.server.abort(msg, new IllegalStateException(msg)); return -1; } - regionStates.updateRegionState(state.getRegion(), - RegionState.State.OFFLINE); + regionStates.updateRegionState(state.getRegion(), State.OFFLINE); int versionOfOfflineNode; try { // get the version after setting the znode to OFFLINE @@ -2316,21 +2375,25 @@ public class AssignmentManager extends Z // creation ReentrantLock lock = locker.acquireLock(encodedName); RegionState state = regionStates.getRegionTransitionState(encodedName); + boolean reassign = true; try { if (state == null) { // Region is not in transition. // We can unassign it only if it's not SPLIT/MERGED. state = regionStates.getRegionState(encodedName); - if (state != null && (state.isMerged() || state.isSplit())) { + if (state != null && (state.isMerged() + || state.isSplit() || state.isOffline())) { LOG.info("Attempting to unassign " + state + ", ignored"); + // Offline region will be reassigned below return; } // Create the znode in CLOSING state try { - state = regionStates.getRegionState(region); if (state == null || state.getServerName() == null) { // We don't know where the region is, offline it. 
// No need to send CLOSE RPC + LOG.warn("Attempting to unassign a region not in RegionStates" + + region.getRegionNameAsString() + ", offlined"); regionOffline(region); return; } @@ -2338,8 +2401,9 @@ public class AssignmentManager extends Z watcher, region, state.getServerName()); if (versionOfClosingNode == -1) { LOG.info("Attempting to unassign " + - region.getRegionNameAsString() + " but ZK closing node " - + "can't be created."); + region.getRegionNameAsString() + " but ZK closing node " + + "can't be created."); + reassign = false; // not unassigned at all return; } } catch (KeeperException e) { @@ -2354,6 +2418,7 @@ public class AssignmentManager extends Z if (isSplitOrSplittingOrMergedOrMerging(path)) { LOG.debug(path + " is SPLIT or SPLITTING or MERGED or MERGING; " + "skipping unassign because region no longer exists -- its split or merge"); + reassign = false; // no need to reassign for split/merged region return; } } catch (KeeperException.NoNodeException ke) { @@ -2369,9 +2434,10 @@ public class AssignmentManager extends Z } // If we get here, don't understand whats going on -- abort. 
server.abort("Unexpected ZK exception creating node CLOSING", e); + reassign = false; // heading out already return; } - state = regionStates.updateRegionState(region, RegionState.State.PENDING_CLOSE); + state = regionStates.updateRegionState(region, State.PENDING_CLOSE); } else if (state.isFailedOpen()) { // The region is not open yet regionOffline(region); @@ -2381,7 +2447,7 @@ public class AssignmentManager extends Z " which is already " + state.getState() + " but forcing to send a CLOSE RPC again "); if (state.isFailedClose()) { - state = regionStates.updateRegionState(region, RegionState.State.PENDING_CLOSE); + state = regionStates.updateRegionState(region, State.PENDING_CLOSE); } state.updateTimestampToNow(); } else { @@ -2392,11 +2458,13 @@ public class AssignmentManager extends Z } unassign(region, state, versionOfClosingNode, dest, true, null); - if (regionStates.isRegionOffline(region)) { - new ClosedRegionHandler(server, this, region).process(); - } } finally { lock.unlock(); + + // Region is expected to be reassigned afterwards + if (reassign && regionStates.isRegionOffline(region)) { + assign(region, true); + } } } @@ -2477,7 +2545,7 @@ public class AssignmentManager extends Z */ public boolean waitForAssignment(HRegionInfo regionInfo) throws InterruptedException { - while (!regionStates.isRegionAssigned(regionInfo)) { + while (!regionStates.isRegionOnline(regionInfo)) { if (regionStates.isRegionInState(regionInfo, State.FAILED_OPEN) || this.server.isStopped()) { return false; @@ -2575,7 +2643,13 @@ public class AssignmentManager extends Z " region(s) to " + servers + " server(s)"); } for (Map.Entry> plan: bulkPlan.entrySet()) { - assign(plan.getKey(), plan.getValue()); + if (!assign(plan.getKey(), plan.getValue())) { + for (HRegionInfo region: plan.getValue()) { + if (!regionStates.isRegionOnline(region)) { + invokeAssign(region); + } + } + } } } else { LOG.info("Bulk assigning " + regions + " region(s) across " @@ -2818,6 +2892,8 @@ public class 
AssignmentManager extends Z if (deadServers != null) { for (Map.Entry> server: deadServers.entrySet()) { ServerName serverName = server.getKey(); + // We need to keep such info even if the server is known dead + regionStates.setLastRegionServerOfRegions(serverName, server.getValue()); if (!serverManager.isServerDead(serverName)) { serverManager.expireServer(serverName); // Let SSH do region re-assign } @@ -3178,9 +3254,10 @@ public class AssignmentManager extends Z RegionState regionState = regionStates.getRegionTransitionState(encodedName); if (regionState == null - || !regionState.isPendingOpenOrOpeningOnServer(sn)) { - LOG.info("Skip " + hri - + " since it is not opening on the dead server any more: " + sn); + || (regionState.getServerName() != null && !regionState.isOnServer(sn)) + || !(regionState.isFailedClose() || regionState.isPendingOpenOrOpening())) { + LOG.info("Skip " + regionState + " since it is not opening/failed_close" + + " on the dead server any more: " + sn); it.remove(); } else { try { @@ -3194,8 +3271,8 @@ public class AssignmentManager extends Z regionStates.regionOffline(hri); continue; } - // Mark the region closed and assign it again by SSH - regionStates.updateRegionState(hri, RegionState.State.CLOSED); + // Mark the region offline and assign it again by SSH + regionStates.updateRegionState(hri, State.OFFLINE); } } finally { lock.unlock(); @@ -3213,17 +3290,10 @@ public class AssignmentManager extends Z */ public void handleSplitReport(final ServerName sn, final HRegionInfo parent, final HRegionInfo a, final HRegionInfo b) { - regionOffline(parent, State.SPLIT); - regionOnline(a, sn); - regionOnline(b, sn); - - // There's a possibility that the region was splitting while a user asked - // the master to disable, we need to make sure we close those regions in - // that case. This is not racing with the region server itself since RS - // report is done after the split transaction completed. 
- if (this.zkTable.isDisablingOrDisabledTable(parent.getTable())) { - unassign(a); - unassign(b); + synchronized (regionStates) { + regionOffline(parent, State.SPLIT); + onlineNewRegion(a, sn); + onlineNewRegion(b, sn); } } @@ -3236,16 +3306,10 @@ public class AssignmentManager extends Z */ public void handleRegionsMergeReport(final ServerName sn, final HRegionInfo merged, final HRegionInfo a, final HRegionInfo b) { - regionOffline(a, State.MERGED); - regionOffline(b, State.MERGED); - regionOnline(merged, sn); - - // There's a possibility that the region was merging while a user asked - // the master to disable, we need to make sure we close those regions in - // that case. This is not racing with the region server itself since RS - // report is done after the regions merge transaction completed. - if (this.zkTable.isDisablingOrDisabledTable(merged.getTable())) { - unassign(merged); + synchronized (regionStates) { + regionOffline(a, State.MERGED); + regionOffline(b, State.MERGED); + onlineNewRegion(merged, sn); } } @@ -3256,7 +3320,14 @@ public class AssignmentManager extends Z synchronized (this.regionPlans) { this.regionPlans.put(plan.getRegionName(), plan); } - unassign(plan.getRegionInfo(), false, plan.getDestination()); + HRegionInfo hri = plan.getRegionInfo(); + TableName tableName = hri.getTable(); + if (zkTable.isDisablingOrDisabledTable(tableName)) { + LOG.info("Ignored moving region of disabling/disabled table " + + tableName); + return; + } + unassign(hri, false, plan.getDestination()); } public void stop() { @@ -3303,8 +3374,7 @@ public class AssignmentManager extends Z new IllegalStateException()); return false; } - regionStates.updateRegionState( - state.getRegion(), RegionState.State.OFFLINE); + regionStates.updateRegionState(state.getRegion(), State.OFFLINE); try { ZKAssign.asyncCreateNodeOffline(watcher, state.getRegion(), destination, cb, state); @@ -3344,8 +3414,8 @@ public class AssignmentManager extends Z + merging_a + ", rs_b=" + merging_b); 
return false; } - regionStates.updateRegionState(merging_a, RegionState.State.MERGING); - regionStates.updateRegionState(merging_b, RegionState.State.MERGING); + regionStates.updateRegionState(merging_a, State.MERGING); + regionStates.updateRegionState(merging_b, State.MERGING); return true; } @@ -3360,4 +3430,21 @@ public class AssignmentManager extends Z // remove the region plan as well just in case. clearRegionPlan(regionInfo); } + + /** + * Online a newly created region, which is usually from split/merge. + */ + private void onlineNewRegion(final HRegionInfo region, final ServerName sn) { + synchronized (regionStates) { + // Someone could find the region from meta and reassign it. + if (regionStates.getRegionState(region) == null) { + regionStates.createRegionState(region); + regionOnline(region, sn); + } + } + // User could disable the table before master knows the new region. + if (zkTable.isDisablingOrDisabledTable(region.getTable())) { + unassign(region); + } + } } Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java (original) +++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java Tue Oct 1 22:02:01 2013 @@ -328,14 +328,6 @@ public class CatalogJanitor extends Chor if (hasNoReferences(a) && hasNoReferences(b)) { LOG.debug("Deleting region " + parent.getRegionNameAsString() + " because daughter splits no longer hold references"); - - // This latter regionOffline should not be necessary but is done for now - // until we let go of regionserver to master heartbeats. See HBASE-3368. 
- if (this.services.getAssignmentManager() != null) { - // The mock used in testing catalogjanitor returns null for getAssignmnetManager. - // Allow for null result out of getAssignmentManager. - this.services.getAssignmentManager().regionOffline(parent); - } FileSystem fs = this.services.getMasterFileSystem().getFileSystem(); if (LOG.isTraceEnabled()) LOG.trace("Archiving parent region: " + parent); HFileArchiver.archiveRegion(this.services.getConfiguration(), fs, parent); @@ -380,7 +372,8 @@ public class CatalogJanitor extends Chor regionFs = HRegionFileSystem.openRegionFromFileSystem( this.services.getConfiguration(), fs, tabledir, daughter, true); } catch (IOException e) { - LOG.warn("Daughter region does not exist: " + daughter.getEncodedName()); + LOG.warn("Daughter region does not exist: " + daughter.getEncodedName() + + ", parent is: " + parent.getEncodedName()); return new Pair(Boolean.FALSE, Boolean.FALSE); } Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (original) +++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java Tue Oct 1 22:02:01 2013 @@ -18,6 +18,8 @@ */ package org.apache.hadoop.hbase.master; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; @@ -39,6 +41,8 @@ import java.util.Set; */ @InterfaceAudience.Private public class DeadServer { + private static final Log LOG = LogFactory.getLog(DeadServer.class); + /** * Set of known dead servers. 
On znode expiration, servers are added here. * This is needed in case of a network partitioning where the server's lease @@ -110,8 +114,8 @@ public class DeadServer { } } - @SuppressWarnings("UnusedParameters") - public synchronized void finish(ServerName ignored) { + public synchronized void finish(ServerName sn) { + LOG.debug("Finished processing " + sn); this.numProcessing--; } Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/GeneralBulkAssigner.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/GeneralBulkAssigner.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/GeneralBulkAssigner.java (original) +++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/GeneralBulkAssigner.java Tue Oct 1 22:02:01 2013 @@ -35,6 +35,7 @@ import org.apache.hadoop.conf.Configurat import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.RegionState.State; /** * Run bulk assign. 
Does one RCP per regionserver passing a @@ -129,9 +130,8 @@ public class GeneralBulkAssigner extends Iterator regionInfoIterator = regionSet.iterator(); while (regionInfoIterator.hasNext()) { HRegionInfo hri = regionInfoIterator.next(); - RegionState state = regionStates.getRegionState(hri); - if ((!regionStates.isRegionInTransition(hri) && regionStates.isRegionAssigned(hri)) - || state.isSplitting() || state.isMerging()) { + if (regionStates.isRegionOnline(hri) || regionStates.isRegionInState(hri, + State.SPLITTING, State.SPLIT, State.MERGING, State.MERGED)) { regionInfoIterator.remove(); } } @@ -195,8 +195,11 @@ public class GeneralBulkAssigner extends + " regions to server " + e.getKey() + ", reassigning them"); reassigningRegions.addAll(failedPlans.remove(e.getKey())); } + RegionStates regionStates = assignmentManager.getRegionStates(); for (HRegionInfo region : reassigningRegions) { - assignmentManager.invokeAssign(region); + if (!regionStates.isRegionOnline(region)) { + assignmentManager.invokeAssign(region); + } } return reassigningRegions.size(); } Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1528227&r1=1528226&r2=1528227&view=diff ============================================================================== --- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original) +++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Oct 1 22:02:01 2013 @@ -2516,8 +2516,7 @@ MasterServices, Server { LOG.debug(getClientIdAuditPrefix() + " unassign " + hri.getRegionNameAsString() + " in current location if it is online and reassign.force=" + force); this.assignmentManager.unassign(hri, force); - if (!this.assignmentManager.getRegionStates().isRegionInTransition(hri) - && 
!this.assignmentManager.getRegionStates().isRegionAssigned(hri)) { + if (this.assignmentManager.getRegionStates().isRegionOffline(hri)) { LOG.debug("Region " + hri.getRegionNameAsString() + " is not online on any region server, reassigning it."); assignRegion(hri);