Return-Path: X-Original-To: apmail-accumulo-commits-archive@www.apache.org Delivered-To: apmail-accumulo-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 2F31910CBC for ; Sat, 7 Dec 2013 04:14:36 +0000 (UTC) Received: (qmail 40816 invoked by uid 500); 7 Dec 2013 04:14:30 -0000 Delivered-To: apmail-accumulo-commits-archive@accumulo.apache.org Received: (qmail 40789 invoked by uid 500); 7 Dec 2013 04:14:30 -0000 Mailing-List: contact commits-help@accumulo.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@accumulo.apache.org Delivered-To: mailing list commits@accumulo.apache.org Received: (qmail 40779 invoked by uid 99); 7 Dec 2013 04:14:30 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 07 Dec 2013 04:14:30 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id B5DBB8A79F6; Sat, 7 Dec 2013 04:14:29 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: elserj@apache.org To: commits@accumulo.apache.org Date: Sat, 07 Dec 2013 04:14:29 -0000 Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: [1/6] git commit: ACCUMULO-1794 adds hdfs failover to continuous integration test. Updated Branches: refs/heads/1.6.0-SNAPSHOT 352dc1b8b -> 7fb59e909 ACCUMULO-1794 adds hdfs failover to continuous integration test. 
Signed-off-by: Josh Elser Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/872fd1df Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/872fd1df Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/872fd1df Branch: refs/heads/1.6.0-SNAPSHOT Commit: 872fd1dfb252e45560b5547aad43399fe433f1a1 Parents: 513f4d2 Author: Sean Busbey Authored: Sat Nov 16 02:46:40 2013 -0600 Committer: Josh Elser Committed: Fri Dec 6 20:35:28 2013 -0500 ---------------------------------------------------------------------- .../system/continuous/continuous-env.sh.example | 9 + test/system/continuous/hdfs-agitator.pl | 217 +++++++++++++++++++ test/system/continuous/start-agitator.sh | 5 + test/system/continuous/stop-agitator.sh | 4 + 4 files changed, 235 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/accumulo/blob/872fd1df/test/system/continuous/continuous-env.sh.example ---------------------------------------------------------------------- diff --git a/test/system/continuous/continuous-env.sh.example b/test/system/continuous/continuous-env.sh.example index 830ae86..1d39034 100644 --- a/test/system/continuous/continuous-env.sh.example +++ b/test/system/continuous/continuous-env.sh.example @@ -89,6 +89,15 @@ MAX_KILL=1 MASTER_KILL_SLEEP_TIME=60 MASTER_RESTART_SLEEP_TIME=2 +#Do we want to perturb HDFS? Only works on HDFS versions with HA, i.e. 
Hadoop 2 +# AGITATE_HDFS=true +#Defaults to false for Accumulo versions that ship default against Hadoop 1 +AGITATE_HDFS=false +AGITATE_HDFS_SLEEP_TIME=10 +AGITATE_HDFS_SUPERUSER=hdfs +AGITATE_HDFS_COMMAND="${HADOOP_PREFIX:/usr/lib/hadoop}/share/hadoop/hdfs/bin/hdfs" +AGITATE_HDFS_SUDO=`which sudo` + #settings for the verification map reduce job VERIFY_OUT=/tmp/continuous_verify VERIFY_MAX_MAPS=64 http://git-wip-us.apache.org/repos/asf/accumulo/blob/872fd1df/test/system/continuous/hdfs-agitator.pl ---------------------------------------------------------------------- diff --git a/test/system/continuous/hdfs-agitator.pl b/test/system/continuous/hdfs-agitator.pl new file mode 100755 index 0000000..85eab32 --- /dev/null +++ b/test/system/continuous/hdfs-agitator.pl @@ -0,0 +1,217 @@ +#! /usr/bin/env perl + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; +use POSIX qw(strftime); +use Getopt::Long; +use Pod::Usage; + +my $help = 0; +my $man = 0; +my $sleep = 10; +my $superuser = 'hdfs'; +my $hdfsCmd; +if( defined $ENV{'HADOOP_PREFIX'} ){ + $hdfsCmd = $ENV{'HADOOP_PREFIX'} . '/share/hadoop/hdfs/bin/hdfs'; +} +my $sudo; +my $nameservice; + +GetOptions('help|?' 
=> \$help, 'man' => \$man, 'sleep=i' => \$sleep, 'nameservice=s' => \$nameservice, 'superuser=s' => \$superuser, 'hdfs-cmd=s' => \$hdfsCmd, 'sudo:s' => \$sudo) or pod2usage(2); +pod2usage(-exitval => 0, -verbose => 1) if $help; +pod2usage(-exitval => 0, -verbose => 2) if $man; +pod2usage(-exitval => 1, -verbose => 1, -message => '$HADOOP_PREFIX not defined and no hdfs-cmd given. please use --hdfs-cmd to specify where your hdfs cli is.') if not defined $hdfsCmd; +pod2usage(-exitval => 1, -verbose => 1, -message => "Your specified hdfs cli '$hdfsCmd' is not executable.") if not -x $hdfsCmd; +if( defined $sudo and "" eq $sudo ){ + $sudo = `which sudo`; + pod2usage(-exitval => 1, -verbose => 1, -message => "Error attempting to find the sudo command, please specify it with --sudo /path/to/sudo") if 0 != $?; + chomp($sudo); +} +if( defined $sudo ){ + pod2usage(-exitval => 1, -verbose => 1, -message => "Your specified sudo command '$sudo' is not executable.") if not -x $sudo; +} + +my $needsudo = defined $sudo; +my $haadmin = "$hdfsCmd haadmin"; +if($needsudo) { + $haadmin = "$sudo -u $superuser $haadmin"; + print STDERR "Starting HDFS agitator, configured to fail over every $sleep minutes. will run hdfs command '$hdfsCmd' as user '$superuser' via '$sudo'.\n"; +} else { + print STDERR "Starting HDFS agitator, configured to fail over every $sleep minutes. will run hdfs command '$hdfsCmd' as the current user.\n"; +} +while(1){ + sleep($sleep * 60); + my $t = strftime "%Y%m%d %H:%M:%S", localtime; + my @failServices; + if( defined $nameservice ){ + @failServices = ($nameservice); + } else { + my $nameservicesRaw = `$hdfsCmd getconf -confKey dfs.nameservices`; + if(0 != $?) { + print STDERR "$t HDFS CLI failed. please see --help to set it correctly\n"; + exit(1); + } + chomp($nameservicesRaw); + my @nameservices = split(/,/, $nameservicesRaw); + if(1 > scalar(@nameservices)) { + print STDERR "$t No HDFS NameServices found. 
Are you sure you're running in HA?\n"; + exit(1); + } + if(rand(1) < .5){ + my $serviceToFail = $nameservices[int(rand(scalar(@nameservices)))]; + print STDERR "$t Failing over nameservice $serviceToFail\n"; + @failServices = ($serviceToFail); + } else { + print STDERR "$t Failing over all nameservices\n"; + @failServices = @nameservices; + } + } + for my $toFail (@failServices){ + my $namenodesRaw = `$hdfsCmd getconf -confKey dfs.ha.namenodes.$toFail`; + if(0 != $?) { + print STDERR "$t HDFS CLI failed to look up namenodes in service $toFail.\n"; + exit(1); + } + chomp($namenodesRaw); + my @namenodes = split(/,/, $namenodesRaw); + if(2 > scalar(@namenodes)) { + print STDERR "$t WARN NameService $toFail does not have at least 2 namenodes according to the HDFS configuration, skipping.\n"; + next; + } + my $active; + for my $namenode (@namenodes){ + my $status = `$haadmin -ns $toFail -getServiceState $namenode`; + if(0 != $?) { + if($needsudo) { + print STDERR "$t WARN Error while attempting to get the service state of $toFail :: $namenode\n"; + $status = 'error'; + } else { + print STDERR "$t WARN Current user may not run the HDFS haadmin utility, attempting to sudo to the $superuser user.\n"; + $needsudo = 1; + if(not defined $sudo) { + $sudo = `which sudo`; + pod2usage(-exitval => 1, -verbose => 1, -message => "Error attempting to find the sudo command, please specify it with --sudo") if 0 != $?; + chomp($sudo); + pod2usage(-exitval => 1, -verbose => 1, -message => "The sudo command '$sudo' is not executable. 
please specify sudo with --sudo") if not -x $sudo; + } + $haadmin = "$sudo -u $superuser $haadmin"; + redo; + } + } + chomp($status); + if( 'active' eq $status ){ + $active = $namenode; + last; + } + } + if( defined $active ){ + my @standby = grep { $_ ne $active } @namenodes; + my $newActive = $standby[int(rand(scalar(@standby)))]; + print STDERR "$t Transitioning nameservice $toFail from $active to $newActive\n"; + my $cmd = "$haadmin -ns $toFail -failover $active $newActive"; + print "$t $cmd\n"; + system($cmd); + } else { + my $newActive = $namenodes[int(rand(scalar(@namenodes)))]; + print STDERR "$t WARN nameservice $toFail did not have an active namenode. Transitioning a random namenode to active. This will fail if HDFS is configured for automatic failover.\n"; + my $cmd = "$haadmin -ns $toFail -transitionToActive $newActive"; + print "$t $cmd\n"; + system($cmd); + } + } +} +__END__ + +=head1 NAME + +hdfs-agitator - causes HDFS to failover + +=head1 DESCRIPTION + +Sleeps for a configurable amount of time, then causes a NameNode failover in one +or more HDFS NameServices. If a given NameService does not have an Active +NameNode when it comes time to failover, a random standby is promoted. + +Only works on HDFS versions that support HA configurations and the haadmin +command. In order to function, the user running this script must be able to +use the haadmin command. This requires access to an HDFS superuser. By default, +it will attempt to sudo to perform calls. + +=head1 SYNOPSIS + +hdfs-agitator [options] + + Options: + --help Brief help message + --man Full documentation + --sleep Time to sleep between failovers in minutes. Default 10 + --superuser HDFS superuser. Default 'hdfs' + --hdfs-cmd hdfs command path. Default '$HADOOP_PREFIX/share/hadoop/hdfs/bin/hdfs' + --nameservice Limit failovers to specified nameservice. Default all nameservices + --sudo command to call to sudo to the HDFS superuser. Default 'sudo' if needed. 
+ +=head1 OPTIONS + +=over 8 + +=item B<--sleep> + +Sleep the given number of minutes between attempts to fail over nameservices. + +=item B<--nameservice> + +Limit failover attempts to the given nameservice. By default, we attempt to list +all known nameservices and choose either one or all of them to failover in a +given cycle. + +=item B<--superuser> + +An HDFS superuser capable of running the haadmin command. Defaults to "hdfs". + +=item B<--hdfs-cmd> + +Path to the HDFS cli. Will be used both for non-administrative commands (e.g. +listing the nameservices and serviceids in a given nameservice) and admin-only +actions such as checking status and failing over. + +Defaults to using $HADOOP_PREFIX. + +=item B<--sudo> + +Command to allow us to act as the given HDFS superuser. By default we assume the current user +can run HDFS administrative commands. When this argument is specified we will instead attempt +to use the HDFS superuser. If given an argument, it will be called like +sudo, i.e. "sudo -u $superuser $cmd". Defaults to "sudo" on the shell's path. 
+ +=back + +=head1 SEE ALSO + +See the Apache Hadoop documentation on configuring HDFS HA + +=over 8 + +=item B + +http://hadoop.apache.org/docs/r2.2.0/hadoop-yarn/hadoop-yarn-site/HDFSHighAvailabilityWithQJM.html#Administrative_commands + +=item B + +http://hadoop.apache.org/docs/r2.2.0/hadoop-yarn/hadoop-yarn-site/HDFSHighAvailabilityWithNFS.html#Administrative_commands + +=back http://git-wip-us.apache.org/repos/asf/accumulo/blob/872fd1df/test/system/continuous/start-agitator.sh ---------------------------------------------------------------------- diff --git a/test/system/continuous/start-agitator.sh b/test/system/continuous/start-agitator.sh index 52e5a4e..c734943 100755 --- a/test/system/continuous/start-agitator.sh +++ b/test/system/continuous/start-agitator.sh @@ -21,3 +21,8 @@ CONTINUOUS_CONF_DIR=${CONTINUOUS_CONF_DIR:-$ACCUMULO_HOME/test/system/continuous nohup ./agitator.pl $KILL_SLEEP_TIME $TUP_SLEEP_TIME $MIN_KILL $MAX_KILL >$CONTINUOUS_LOG_DIR/`date +%Y%m%d%H%M%S`_`hostname`_agitator.out 2>$CONTINUOUS_LOG_DIR/`date +%Y%m%d%H%M%S`_`hostname`_agitator.err & nohup ./magitator.pl $MASTER_KILL_SLEEP_TIME $MASTER_RESTART_SLEEP_TIME >$CONTINUOUS_LOG_DIR/`date +%Y%m%d%H%M%S`_`hostname`_magitator.out 2>$CONTINUOUS_LOG_DIR/`date +%Y%m%d%H%M%S`_`hostname`_magitator.err & + +if ${AGITATE_HDFS:-false} ; then + AGITATOR_LOG=${CONTINUOUS_LOG_DIR}/`date +%Y%m%d%H%M%S`_`hostname`_hdfs-agitator + nohup ./hdfs-agitator.pl --sleep ${AGITATE_HDFS_SLEEP_TIME} --hdfs-cmd ${AGITATE_HDFS_COMMAND} --superuser ${AGITATE_HDFS_SUPERUSER} --sudo ${AGITATE_HDFS_SUDO} >${AGITATOR_LOG}.out 2>${AGITATOR_LOG}.err & +fi http://git-wip-us.apache.org/repos/asf/accumulo/blob/872fd1df/test/system/continuous/stop-agitator.sh ---------------------------------------------------------------------- diff --git a/test/system/continuous/stop-agitator.sh b/test/system/continuous/stop-agitator.sh index b853a55..f26e3b2 100755 --- a/test/system/continuous/stop-agitator.sh +++ 
b/test/system/continuous/stop-agitator.sh @@ -18,5 +18,9 @@ CONTINUOUS_CONF_DIR=${CONTINUOUS_CONF_DIR:-$ACCUMULO_HOME/test/system/continuous/} . $CONTINUOUS_CONF_DIR/continuous-env.sh +if ${AGITATE_HDFS:-false} ; then + pkill -f hdfs-agitator.pl +fi + pkill -f agitator.pl