Return-Path: X-Original-To: apmail-hadoop-hdfs-commits-archive@minotaur.apache.org Delivered-To: apmail-hadoop-hdfs-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 195FB9D44 for ; Fri, 11 May 2012 05:58:09 +0000 (UTC) Received: (qmail 91367 invoked by uid 500); 11 May 2012 05:58:08 -0000 Delivered-To: apmail-hadoop-hdfs-commits-archive@hadoop.apache.org Received: (qmail 87563 invoked by uid 500); 11 May 2012 05:58:02 -0000 Mailing-List: contact hdfs-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hdfs-dev@hadoop.apache.org Delivered-To: mailing list hdfs-commits@hadoop.apache.org Received: (qmail 87488 invoked by uid 99); 11 May 2012 05:57:59 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 11 May 2012 05:57:59 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 11 May 2012 05:57:58 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 0FD7F2388860; Fri, 11 May 2012 05:57:38 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1337030 - in /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs: CHANGES.txt src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStateTransitionFailure.java Date: Fri, 11 May 2012 05:57:37 -0000 To: hdfs-commits@hadoop.apache.org From: atm@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20120511055738.0FD7F2388860@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: atm Date: Fri May 11 05:57:37 2012 New Revision: 1337030 URL: http://svn.apache.org/viewvc?rev=1337030&view=rev Log: HDFS-3026. HA: Handle failure during HA state transition. Contributed by Aaron T. Myers. Added: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStateTransitionFailure.java Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1337030&r1=1337029&r2=1337030&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original) +++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Fri May 11 05:57:37 2012 @@ -651,6 +651,8 @@ Release 2.0.0 - UNRELEASED necessarily a BlockInfoUnderConstruction, so do not cast it in FSNamesystem.recoverLeaseInternal(..). (szetszwo) + HDFS-3026. HA: Handle failure during HA state transition. (atm) + BREAKDOWN OF HDFS-1623 SUBTASKS HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd) Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java?rev=1337030&r1=1337029&r2=1337030&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java (original) +++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java Fri May 11 05:57:37 2012 @@ -206,6 +206,7 @@ public class NameNode { private final boolean haEnabled; private final HAContext haContext; protected boolean allowStaleStandbyReads; + private Runtime runtime = Runtime.getRuntime(); /** httpServer */ @@ -481,11 +482,16 @@ public class NameNode { } private void startTrashEmptier(Configuration conf) throws IOException { - long trashInterval - = conf.getLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY, - CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT); - if(trashInterval == 0) + long trashInterval = conf.getLong( + CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY, + CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT); + if (trashInterval == 0) { return; + } else if (trashInterval < 0) { + throw new IOException("Cannot start tresh emptier with negative interval." + + " Set " + CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY + " to a" + + " positive value."); + } this.emptier = new Thread(new Trash(conf).getEmptier(), "Trash Emptier"); this.emptier.setDaemon(true); this.emptier.start(); @@ -1235,14 +1241,37 @@ public class NameNode { } return state.getServiceState(); } + + @VisibleForTesting + public synchronized void setRuntimeForTesting(Runtime runtime) { + this.runtime = runtime; + } /** - * Class used as expose {@link NameNode} as context to {@link HAState} + * Shutdown the NN immediately in an ungraceful way. Used when it would be + * unsafe for the NN to continue operating, e.g. during a failed HA state + * transition. * - * TODO(HA): - * When entering and exiting state, on failing to start services, - * appropriate action is needed todo either shutdown the node or recover - * from failure. + * @param t exception which warrants the shutdown. Printed to the NN log + * before exit. + * @throws ServiceFailedException thrown only for testing. + */ + private synchronized void doImmediateShutdown(Throwable t) + throws ServiceFailedException { + String message = "Error encountered requiring NN shutdown. " + + "Shutting down immediately."; + try { + LOG.fatal(message, t); + } catch (Throwable ignored) { + // This is unlikely to happen, but there's nothing we can do if it does. + } + runtime.exit(1); + // This code is only reached during testing, when runtime is stubbed out. + throw new ServiceFailedException(message, t); + } + + /** + * Class used to expose {@link NameNode} as context to {@link HAState} */ protected class NameNodeHAContext implements HAContext { @Override @@ -1257,32 +1286,52 @@ public class NameNode { @Override public void startActiveServices() throws IOException { - namesystem.startActiveServices(); - startTrashEmptier(conf); + try { + namesystem.startActiveServices(); + startTrashEmptier(conf); + } catch (Throwable t) { + doImmediateShutdown(t); + } } @Override public void stopActiveServices() throws IOException { - if (namesystem != null) { - namesystem.stopActiveServices(); + try { + if (namesystem != null) { + namesystem.stopActiveServices(); + } + stopTrashEmptier(); + } catch (Throwable t) { + doImmediateShutdown(t); } - stopTrashEmptier(); } @Override public void startStandbyServices() throws IOException { - namesystem.startStandbyServices(conf); + try { + namesystem.startStandbyServices(conf); + } catch (Throwable t) { + doImmediateShutdown(t); + } } @Override public void prepareToStopStandbyServices() throws ServiceFailedException { - namesystem.prepareToStopStandbyServices(); + try { + namesystem.prepareToStopStandbyServices(); + } catch (Throwable t) { + doImmediateShutdown(t); + } } @Override public void stopStandbyServices() throws IOException { - if (namesystem != null) { - namesystem.stopStandbyServices(); + try { + if (namesystem != null) { + namesystem.stopStandbyServices(); + } + } catch (Throwable t) { + doImmediateShutdown(t); } } Added: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStateTransitionFailure.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStateTransitionFailure.java?rev=1337030&view=auto ============================================================================== --- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStateTransitionFailure.java (added) +++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStateTransitionFailure.java Fri May 11 05:57:37 2012 @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode.ha; + +import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains; +import static org.junit.Assert.fail; +import static org.mockito.Matchers.anyInt; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.junit.Test; + +/** + * Tests to verify the behavior of failing to fully start transition HA states. + */ +public class TestStateTransitionFailure { + + public static final Log LOG = LogFactory.getLog(TestStateTransitionFailure.class); + + /** + * Ensure that a failure to fully transition to the active state causes a + * shutdown of the NameNode. + */ + @Test + public void testFailureToTransitionCausesShutdown() throws IOException { + MiniDFSCluster cluster = null; + try { + Configuration conf = new Configuration(); + // Set an illegal value for the trash emptier interval. This will cause + // the NN to fail to transition to the active state. + conf.setLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY, -1); + cluster = new MiniDFSCluster.Builder(conf) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(0) + .build(); + cluster.waitActive(); + Runtime mockRuntime = mock(Runtime.class); + cluster.getNameNode(0).setRuntimeForTesting(mockRuntime); + verify(mockRuntime, times(0)).exit(anyInt()); + try { + cluster.transitionToActive(0); + fail("Transitioned to active but should not have been able to."); + } catch (ServiceFailedException sfe) { + assertExceptionContains("Error encountered requiring NN shutdown. " + + "Shutting down immediately.", sfe); + LOG.info("got expected exception", sfe); + } + verify(mockRuntime, times(1)).exit(anyInt()); + } finally { + if (cluster != null) { + cluster.shutdown(); + } + } + } +}