hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ssrungar...@apache.org
Subject hbase git commit: HBASE-14261 Enhance Chaos Monkey framework by adding zookeeper and datanode fault injections.
Date Thu, 10 Sep 2015 07:50:16 GMT
Repository: hbase
Updated Branches:
  refs/heads/branch-1.1 d1d403d97 -> 95425c44c


HBASE-14261 Enhance Chaos Monkey framework by adding zookeeper and datanode fault injections.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/95425c44
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/95425c44
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/95425c44

Branch: refs/heads/branch-1.1
Commit: 95425c44c06684ff29a0984fdda15e2b4165008f
Parents: d1d403d
Author: Srikanth Srungarapu <ssrungarapu@cloudera.com>
Authored: Thu Sep 3 11:49:40 2015 -0700
Committer: Srikanth Srungarapu <ssrungarapu@cloudera.com>
Committed: Thu Sep 10 00:50:16 2015 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/ClusterManager.java |   1 +
 .../hadoop/hbase/DistributedHBaseCluster.java   |  86 ++++++++++++-
 .../hadoop/hbase/HBaseClusterManager.java       | 122 ++++++++++++++++---
 .../hadoop/hbase/RESTApiClusterManager.java     |   1 +
 .../hadoop/hbase/chaos/actions/Action.java      |  70 ++++++++++-
 .../chaos/actions/RestartActionBaseAction.java  |  14 +++
 .../actions/RestartRandomDataNodeAction.java    |  58 +++++++++
 .../actions/RestartRandomZKNodeAction.java      |  40 ++++++
 .../hbase/chaos/factories/MonkeyFactory.java    |   2 +
 ...rverAndDependenciesKillingMonkeyFactory.java |  65 ++++++++++
 .../hadoop/hbase/zookeeper/ZKServerTool.java    |  33 +++--
 .../org/apache/hadoop/hbase/HBaseCluster.java   |  75 ++++++++++++
 .../apache/hadoop/hbase/MiniHBaseCluster.java   |  50 ++++++++
 13 files changed, 583 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
index 2d46279..db5f8d8 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java
@@ -39,6 +39,7 @@ interface ClusterManager extends Configurable {
     HADOOP_DATANODE("datanode"),
     HADOOP_JOBTRACKER("jobtracker"),
     HADOOP_TASKTRACKER("tasktracker"),
+    ZOOKEEPER_SERVER("QuorumPeerMain"),
     HBASE_MASTER("master"),
     HBASE_REGIONSERVER("regionserver");
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
index 6e7cd33..07ca5ec 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
@@ -114,16 +114,14 @@ public class DistributedHBaseCluster extends HBaseCluster {
   public void killRegionServer(ServerName serverName) throws IOException {
     LOG.info("Aborting RS: " + serverName.getServerName());
     clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
-            serverName.getHostname(),
-            serverName.getPort());
+      serverName.getHostname(), serverName.getPort());
   }
 
   @Override
   public void stopRegionServer(ServerName serverName) throws IOException {
     LOG.info("Stopping RS: " + serverName.getServerName());
     clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
-            serverName.getHostname(),
-            serverName.getPort());
+      serverName.getHostname(), serverName.getPort());
   }
 
   @Override
@@ -131,20 +129,96 @@ public class DistributedHBaseCluster extends HBaseCluster {
     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
   }
 
+  @Override
+  public void startZkNode(String hostname, int port) throws IOException {
+    LOG.info("Starting Zookeeper node on: " + hostname);
+    clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
+  }
+
+  @Override
+  public void killZkNode(ServerName serverName) throws IOException {
+    LOG.info("Aborting Zookeeper node on: " + serverName.getServerName());
+    clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void stopZkNode(ServerName serverName) throws IOException {
+    LOG.info("Stopping Zookeeper node: " + serverName.getServerName());
+    clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException
{
+    waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
+  }
+
+  @Override
+  public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException
{
+    waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
+  }
+
+  @Override
+  public void startDataNode(ServerName serverName) throws IOException {
+    LOG.info("Starting data node on: " + serverName.getServerName());
+    clusterManager.start(ServiceType.HADOOP_DATANODE,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void killDataNode(ServerName serverName) throws IOException {
+    LOG.info("Aborting data node on: " + serverName.getServerName());
+    clusterManager.kill(ServiceType.HADOOP_DATANODE,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void stopDataNode(ServerName serverName) throws IOException {
+    LOG.info("Stopping data node on: " + serverName.getServerName());
+    clusterManager.stop(ServiceType.HADOOP_DATANODE,
+      serverName.getHostname(), serverName.getPort());
+  }
+
+  @Override
+  public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException
{
+    waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout);
+  }
+
+  @Override
+  public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException
{
+    waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
+  }
+
   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
     throws IOException {
-    LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
+    LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
     long start = System.currentTimeMillis();
 
     while ((System.currentTimeMillis() - start) < timeout) {
       if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort()))
{
         return;
       }
-      Threads.sleep(1000);
+      Threads.sleep(100);
     }
     throw new IOException("did timeout waiting for service to stop:" + serverName);
   }
 
+  private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
+    throws IOException {
+    LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
+    long start = System.currentTimeMillis();
+
+    while ((System.currentTimeMillis() - start) < timeout) {
+      if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort()))
{
+        return;
+      }
+      Threads.sleep(100);
+    }
+    throw new IOException("did timeout waiting for service to start:" + serverName);
+  }
+
+
   @Override
   public MasterService.BlockingInterface getMasterAdminService()
   throws IOException {

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
index 8bdb5d6..c49ae44 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
@@ -54,9 +54,11 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
 
   /**
    * The command format that is used to execute the remote command. Arguments:
-   * 1 SSH options, 2 user name , 3 "@" if username is set, 4 host, 5 original command.
+   * 1 SSH options, 2 user name , 3 "@" if username is set, 4 host,
+   * 5 original command, 6 service user.
    */
-  private static final String DEFAULT_TUNNEL_CMD = "/usr/bin/ssh %1$s %2$s%3$s%4$s \"%5$s\"";
+  private static final String DEFAULT_TUNNEL_CMD =
+      "/usr/bin/ssh %1$s %2$s%3$s%4$s \"sudo -u %6$s %5$s\"";
   private String tunnelCmd;
 
   private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts";
@@ -93,11 +95,24 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
         .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL)));
   }
 
+  private String getServiceUser(ServiceType service) {
+    Configuration conf = getConf();
+    switch (service) {
+      case HADOOP_DATANODE:
+        return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
+      case ZOOKEEPER_SERVER:
+        return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
+      default:
+        return conf.get("hbase.it.clustermanager.hbase.user", "hbase");
+    }
+  }
+
   /**
    * Executes commands over SSH
    */
   protected class RemoteShell extends Shell.ShellCommandExecutor {
     private String hostname;
+    private String user;
 
     public RemoteShell(String hostname, String[] execString, File dir, Map<String, String>
env,
         long timeout) {
@@ -120,11 +135,17 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
       this.hostname = hostname;
     }
 
+    public RemoteShell(String hostname, String user, String[] execString) {
+      super(execString);
+      this.hostname = hostname;
+      this.user = user;
+    }
+
     @Override
     public String[] getExecString() {
       String at = sshUserName.isEmpty() ? "" : "@";
       String remoteCmd = StringUtils.join(super.getExecString(), " ");
-      String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd);
+      String cmd = String.format(tunnelCmd, sshOptions, sshUserName, at, hostname, remoteCmd,
user);
       LOG.info("Executing full command [" + cmd + "]");
       return new String[] { "/usr/bin/env", "bash", "-c", cmd };
     }
@@ -188,13 +209,83 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
     }
   }
 
+  /**
+   * CommandProvider to manage the service using sbin/hadoop-* scripts.
+   */
+  static class HadoopShellCommandProvider extends CommandProvider {
+    private final String hadoopHome;
+    private final String confDir;
+
+    HadoopShellCommandProvider(Configuration conf) throws IOException {
+      hadoopHome = conf.get("hbase.it.clustermanager.hadoop.home",
+          System.getenv("HADOOP_HOME"));
+      String tmp = conf.get("hbase.it.clustermanager.hadoop.conf.dir",
+          System.getenv("HADOOP_CONF_DIR"));
+      if (hadoopHome == null) {
+        throw new IOException("Hadoop home configuration parameter i.e. " +
+          "'hbase.it.clustermanager.hadoop.home' is not configured properly.");
+      }
+      if (tmp != null) {
+        confDir = String.format("--config %s", tmp);
+      } else {
+        confDir = "";
+      }
+    }
+
+    @Override
+    public String getCommand(ServiceType service, Operation op) {
+      return String.format("%s/sbin/hadoop-daemon.sh %s %s %s", hadoopHome, confDir,
+          op.toString().toLowerCase(), service);
+    }
+  }
+
+  /**
+   * CommandProvider to manage the service using bin/zk* scripts.
+   */
+  static class ZookeeperShellCommandProvider extends CommandProvider {
+    private final String zookeeperHome;
+    private final String confDir;
+
+    ZookeeperShellCommandProvider(Configuration conf) throws IOException {
+      zookeeperHome = conf.get("hbase.it.clustermanager.zookeeper.home",
+          System.getenv("ZOOBINDIR"));
+      String tmp = conf.get("hbase.it.clustermanager.zookeeper.conf.dir",
+          System.getenv("ZOOCFGDIR"));
+      if (zookeeperHome == null) {
+        throw new IOException("Zookeeper home configuration parameter i.e. " +
+          "'hbase.it.clustermanager.zookeeper.home' is not configured properly.");
+      }
+      if (tmp != null) {
+        confDir = String.format("--config %s", tmp);
+      } else {
+        confDir = "";
+      }
+    }
+
+    @Override
+    public String getCommand(ServiceType service, Operation op) {
+      return String.format("%s/bin/zkServer.sh %s", zookeeperHome, op.toString().toLowerCase());
+    }
+
+    @Override
+    protected String findPidCommand(ServiceType service) {
+      return String.format("ps aux | grep %s | grep -v grep | tr -s ' ' | cut -d ' ' -f2",
+        service);
+    }
+  }
+
   public HBaseClusterManager() {
   }
 
-  protected CommandProvider getCommandProvider(ServiceType service) {
-    //TODO: make it pluggable, or auto-detect the best command provider, should work with
-    //hadoop daemons as well
-    return new HBaseShellCommandProvider(getConf());
+  protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
+    switch (service) {
+      case HADOOP_DATANODE:
+        return new HadoopShellCommandProvider(getConf());
+      case ZOOKEEPER_SERVER:
+        return new ZookeeperShellCommandProvider(getConf());
+      default:
+        return new HBaseShellCommandProvider(getConf());
+    }
   }
 
   /**
@@ -202,10 +293,11 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
    * @return pair of exit code and command output
    * @throws IOException if something goes wrong.
    */
-  private Pair<Integer, String> exec(String hostname, String... cmd) throws IOException
{
+  private Pair<Integer, String> exec(String hostname, ServiceType service, String...
cmd)
+    throws IOException {
     LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" +
hostname);
 
-    RemoteShell shell = new RemoteShell(hostname, cmd);
+    RemoteShell shell = new RemoteShell(hostname, getServiceUser(service), cmd);
     try {
       shell.execute();
     } catch (Shell.ExitCodeException ex) {
@@ -222,12 +314,12 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
     return new Pair<Integer, String>(shell.getExitCode(), shell.getOutput());
   }
 
-  private Pair<Integer, String> execWithRetries(String hostname, String... cmd)
+  private Pair<Integer, String> execWithRetries(String hostname, ServiceType service,
String... cmd)
       throws IOException {
     RetryCounter retryCounter = retryCounterFactory.create();
     while (true) {
       try {
-        return exec(hostname, cmd);
+        return exec(hostname, service, cmd);
       } catch (IOException e) {
         retryOrThrow(retryCounter, e, hostname, cmd);
       }
@@ -252,7 +344,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
   }
 
   private void exec(String hostname, ServiceType service, Operation op) throws IOException
{
-    execWithRetries(hostname, getCommandProvider(service).getCommand(service, op));
+    execWithRetries(hostname, service, getCommandProvider(service).getCommand(service, op));
   }
 
   @Override
@@ -271,13 +363,13 @@ public class HBaseClusterManager extends Configured implements ClusterManager
{
   }
 
   public void signal(ServiceType service, String signal, String hostname) throws IOException
{
-    execWithRetries(hostname, getCommandProvider(service).signalCommand(service, signal));
+    execWithRetries(hostname, service, getCommandProvider(service).signalCommand(service,
signal));
   }
 
   @Override
   public boolean isRunning(ServiceType service, String hostname, int port) throws IOException
{
-    String ret = execWithRetries(hostname, getCommandProvider(service).isRunningCommand(service))
-        .getSecond();
+    String ret = execWithRetries(hostname, service,
+      getCommandProvider(service).isRunningCommand(service)).getSecond();
     return ret.length() > 0;
   }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
index 9ea126a..717de17 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/RESTApiClusterManager.java
@@ -321,6 +321,7 @@ public class RESTApiClusterManager extends Configured implements ClusterManager
 
   // The RoleCommand enum is used by the doRoleCommand method to guard against non-existent
methods
   // being invoked on a given role.
+  // TODO: Integrate zookeeper and hdfs related failure injections (Ref: HBASE-14261).
   private enum RoleCommand {
     START, STOP, RESTART;
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index ebc83ff..329ba80 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -27,6 +27,7 @@ import java.util.List;
 import org.apache.commons.lang.math.RandomUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClusterStatus;
 import org.apache.hadoop.hbase.HBaseCluster;
 import org.apache.hadoop.hbase.HRegionInfo;
@@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.ServerLoad;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 import org.apache.hadoop.hbase.client.Admin;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.util.Bytes;
 
 /**
@@ -44,11 +44,19 @@ import org.apache.hadoop.hbase.util.Bytes;
 public class Action {
 
   public static final String KILL_MASTER_TIMEOUT_KEY =
-      "hbase.chaosmonkey.action.killmastertimeout";
+    "hbase.chaosmonkey.action.killmastertimeout";
   public static final String START_MASTER_TIMEOUT_KEY =
-      "hbase.chaosmonkey.action.startmastertimeout";
+    "hbase.chaosmonkey.action.startmastertimeout";
   public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
   public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
+  public static final String KILL_ZK_NODE_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.killzknodetimeout";
+  public static final String START_ZK_NODE_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.startzknodetimeout";
+  public static final String KILL_DATANODE_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.killdatanodetimeout";
+  public static final String START_DATANODE_TIMEOUT_KEY =
+    "hbase.chaosmonkey.action.startdatanodetimeout";
 
   protected static Log LOG = LogFactory.getLog(Action.class);
 
@@ -56,6 +64,10 @@ public class Action {
   protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
 
   protected ActionContext context;
   protected HBaseCluster cluster;
@@ -66,6 +78,10 @@ public class Action {
   protected long startMasterTimeout;
   protected long killRsTimeout;
   protected long startRsTimeout;
+  protected long killZkNodeTimeout;
+  protected long startZkNodeTimeout;
+  protected long killDataNodeTimeout;
+  protected long startDataNodeTimeout;
 
   public void init(ActionContext context) throws IOException {
     this.context = context;
@@ -75,11 +91,19 @@ public class Action {
     initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
 
     killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
-        KILL_MASTER_TIMEOUT_DEFAULT);
+      KILL_MASTER_TIMEOUT_DEFAULT);
     startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
-        START_MASTER_TIMEOUT_DEFAULT);
+      START_MASTER_TIMEOUT_DEFAULT);
     killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
     startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
+    killZkNodeTimeout = cluster.getConf().getLong(KILL_ZK_NODE_TIMEOUT_KEY,
+      KILL_ZK_NODE_TIMEOUT_DEFAULT);
+    startZkNodeTimeout = cluster.getConf().getLong(START_ZK_NODE_TIMEOUT_KEY,
+      START_ZK_NODE_TIMEOUT_DEFAULT);
+    killDataNodeTimeout = cluster.getConf().getLong(KILL_DATANODE_TIMEOUT_KEY,
+      KILL_DATANODE_TIMEOUT_DEFAULT);
+    startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
+      START_DATANODE_TIMEOUT_DEFAULT);
   }
 
   public void perform() throws Exception { }
@@ -132,7 +156,37 @@ public class Action {
     cluster.startRegionServer(server.getHostname(), server.getPort());
     cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
     LOG.info("Started region server:" + server + ". Reported num of rs:"
-        + cluster.getClusterStatus().getServersSize());
+      + cluster.getClusterStatus().getServersSize());
+  }
+
+  protected void killZKNode(ServerName server) throws IOException {
+    LOG.info("Killing zookeeper node:" + server);
+    cluster.killZkNode(server);
+    cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
+    LOG.info("Killed zookeeper node:" + server + ". Reported num of rs:"
+      + cluster.getClusterStatus().getServersSize());
+  }
+
+  protected void startZKNode(ServerName server) throws IOException {
+    LOG.info("Starting zookeeper node:" + server.getHostname());
+    cluster.startZkNode(server.getHostname(), server.getPort());
+    cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
+    LOG.info("Started zookeeper node:" + server);
+  }
+
+  protected void killDataNode(ServerName server) throws IOException {
+    LOG.info("Killing datanode:" + server);
+    cluster.killDataNode(server);
+    cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
+    LOG.info("Killed datanode:" + server + ". Reported num of rs:"
+      + cluster.getClusterStatus().getServersSize());
+  }
+
+  protected void startDataNode(ServerName server) throws IOException {
+    LOG.info("Starting datanode:" + server.getHostname());
+    cluster.startDataNode(server);
+    cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
+    LOG.info("Started datanode:" + server);
   }
 
   protected void unbalanceRegions(ClusterStatus clusterStatus,
@@ -174,6 +228,10 @@ public class Action {
     }
   }
 
+  public Configuration getConf() {
+    return cluster.getConf();
+  }
+
   /**
    * Context for Action's
    */

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
index 8795352..3f209da 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
@@ -51,4 +51,18 @@ public class RestartActionBaseAction extends Action {
     sleep(sleepTime);
     startRs(server);
   }
+
+  void restartZKNode(ServerName server, long sleepTime) throws IOException {
+    sleepTime = Math.max(sleepTime, 1000);
+    killZKNode(server);
+    sleep(sleepTime);
+    startZKNode(server);
+  }
+
+  void restartDataNode(ServerName server, long sleepTime) throws IOException {
+    sleepTime = Math.max(sleepTime, 1000);
+    killDataNode(server);
+    sleep(sleepTime);
+    startDataNode(server);
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java
new file mode 100644
index 0000000..7299e79
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hdfs.DFSClient;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Action that restarts a random datanode.
+ */
+public class RestartRandomDataNodeAction extends RestartActionBaseAction {
+  public RestartRandomDataNodeAction(long sleepTime) {
+    super(sleepTime);
+  }
+
+  @Override
+  public void perform() throws Exception {
+    LOG.info("Performing action: Restart random data node");
+    ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
+    restartDataNode(server, sleepTime);
+  }
+
+  public ServerName[] getDataNodes() throws IOException {
+    DistributedFileSystem fs = (DistributedFileSystem) FSUtils.getRootDir(getConf())
+        .getFileSystem(getConf());
+    DFSClient dfsClient = fs.getClient();
+    List<ServerName> hosts = new LinkedList<ServerName>();
+    for (DatanodeInfo dataNode: dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE))
{
+      hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1));
+    }
+    return hosts.toArray(new ServerName[hosts.size()]);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomZKNodeAction.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomZKNodeAction.java
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomZKNodeAction.java
new file mode 100644
index 0000000..6043acd
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomZKNodeAction.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.zookeeper.ZKServerTool;
+
+/**
+ * Action that restarts a random zookeeper node.
+ */
+public class RestartRandomZKNodeAction extends RestartActionBaseAction {
+  public RestartRandomZKNodeAction(long sleepTime) {
+    super(sleepTime);
+  }
+
+  @Override
+  public void perform() throws Exception {
+    LOG.info("Performing action: Restart random zookeeper node");
+    ServerName server = PolicyBasedChaosMonkey.selectRandomItem(
+        ZKServerTool.readZKNodes(getConf()));
+    restartZKNode(server, sleepTime);
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
index 2f65251..0cf3318 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/MonkeyFactory.java
@@ -70,12 +70,14 @@ public abstract class MonkeyFactory {
   public static final String STRESS_AM = "stressAM";
   public static final String NO_KILL = "noKill";
   public static final String MASTER_KILLING = "masterKilling";
+  public static final String SERVER_AND_DEPENDENCIES_KILLING = "serverAndDependenciesKilling";
 
   public static Map<String, MonkeyFactory> FACTORIES = ImmutableMap.<String,MonkeyFactory>builder()
     .put(CALM, new CalmMonkeyFactory())
     .put(SLOW_DETERMINISTIC, new SlowDeterministicMonkeyFactory())
     .put(UNBALANCE, new UnbalanceMonkeyFactory())
     .put(SERVER_KILLING, new ServerKillingMonkeyFactory())
+    .put(SERVER_AND_DEPENDENCIES_KILLING, new ServerAndDependenciesKillingMonkeyFactory())
     .put(STRESS_AM, new StressAssignmentManagerMonkeyFactory())
     .put(NO_KILL, new NoKillMonkeyFactory())
     .put(MASTER_KILLING, new MasterKillingMonkeyFactory())

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
new file mode 100644
index 0000000..4faa786
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.factories;
+
+import org.apache.hadoop.hbase.chaos.actions.Action;
+import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
+import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
+import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
+import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
+import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
+import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy;
+import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
+
+/**
+ * Creates ChaosMonkeys for doing server restart actions, but not
+ * flush / compact / snapshot kind of actions.
+ */
+public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
+
+  @Override
+  public ChaosMonkey build() {
+
+    // Destructive actions to mess things around. Cannot run batch restart.
+    Action[] actions1 = new Action[]{
+      new RestartRandomRsExceptMetaAction(60000),
+      new RestartActiveMasterAction(5000),
+      new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to
be dead.
+      new ForceBalancerAction(),
+      new RestartRandomDataNodeAction(60000),
+      new RestartRandomZKNodeAction(60000)
+    };
+
+    // Action to log more info for debugging
+    Action[] actions2 = new Action[]{
+      new DumpClusterStatusAction()
+    };
+
+    return new PolicyBasedChaosMonkey(util,
+      new CompositeSequentialPolicy(
+        new DoActionsOncePolicy(60 * 1000, actions1),
+        new PeriodicRandomActionPolicy(60 * 1000, actions1)),
+      new PeriodicRandomActionPolicy(60 * 1000, actions2));
+  }
+}

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java
index 73483da..8a958c4 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKServerTool.java
@@ -19,9 +19,13 @@
 
 package org.apache.hadoop.hbase.zookeeper;
 
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Properties;
 import java.util.Map.Entry;
 
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseConfiguration;
@@ -33,12 +37,10 @@ import org.apache.hadoop.hbase.HBaseInterfaceAudience;
  */
 @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
 public class ZKServerTool {
-  /**
-   * Run the tool.
-   * @param args Command line arguments.
-   */
-  public static void main(String args[]) {
-    Configuration conf = HBaseConfiguration.create();
+
+  public static ServerName[] readZKNodes(Configuration conf) {
+    List<ServerName> hosts = new LinkedList<ServerName>();
+
     // Note that we do not simply grab the property
     // HConstants.ZOOKEEPER_QUORUM from the HBaseConfiguration because the
     // user may be using a zoo.cfg file.
@@ -49,8 +51,25 @@ public class ZKServerTool {
       if (key.startsWith("server.")) {
         String[] parts = value.split(":");
         String host = parts[0];
-        System.out.println("ZK host:" + host);
+
+        int port = HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT;
+        if (parts.length > 1) {
+          port = Integer.parseInt(parts[1]);
+        }
+        hosts.add(ServerName.valueOf(host, port, -1));
       }
     }
+    return hosts.toArray(new ServerName[hosts.size()]);
+  }
+
+  /**
+   * Run the tool.
+   * @param args Command line arguments.
+   */
+  public static void main(String args[]) {
+    for(ServerName server: readZKNodes(HBaseConfiguration.create())) {
+      // bin/zookeeper.sh relies on the "ZK host" string for grepping which is case sensitive.
+      System.out.println("ZK host: " + server.getHostname());
+    }
   }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
index adbdd69..b32b4f1 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
@@ -163,6 +163,81 @@ public abstract class HBaseCluster implements Closeable, Configurable
{
       throws IOException;
 
   /**
+   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
+   * silently logs warning message.
+   * @param hostname the hostname to start the regionserver on
+   * @throws IOException if something goes wrong
+   */
+  public abstract void startZkNode(String hostname, int port) throws IOException;
+
+  /**
+   * Kills the zookeeper node process if this is a distributed cluster, otherwise,
+   * this causes master to exit doing basic clean up only.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void killZkNode(ServerName serverName) throws IOException;
+
+  /**
+   * Stops the region zookeeper if this is a distributed cluster, otherwise
+   * silently logs warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void stopZkNode(ServerName serverName) throws IOException;
+
+  /**
+   * Wait for the specified zookeeper node to join the cluster
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout)
+    throws IOException;
+
+  /**
+   * Wait for the specified zookeeper node to stop the thread / process.
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout)
+    throws IOException;
+
+  /**
+   * Starts a new datanode on the given hostname or if this is a mini/local cluster,
+   * silently logs warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void startDataNode(ServerName serverName) throws IOException;
+
+  /**
+   * Kills the datanode process if this is a distributed cluster, otherwise,
+   * this causes master to exit doing basic clean up only.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void killDataNode(ServerName serverName) throws IOException;
+
+  /**
+   * Stops the datanode if this is a distributed cluster, otherwise
+   * silently logs warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void stopDataNode(ServerName serverName) throws IOException;
+
+  /**
+   * Wait for the specified datanode to join the cluster
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
+    throws IOException;
+
+  /**
+   * Wait for the specified datanode to stop the thread / process.
+   * @return whether the operation finished with success
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
+    throws IOException;
+
+  /**
    * Starts a new master on the given hostname or if this is a mini/local cluster,
    * starts a master locally.
    * @param hostname the hostname to start the master on

http://git-wip-us.apache.org/repos/asf/hbase/blob/95425c44/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
index 4a02b04..099b836 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
@@ -261,6 +261,56 @@ public class MiniHBaseCluster extends HBaseCluster {
   }
 
   @Override
+  public void startZkNode(String hostname, int port) throws IOException {
+    LOG.warn("Starting zookeeper nodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void killZkNode(ServerName serverName) throws IOException {
+    LOG.warn("Aborting zookeeper nodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void stopZkNode(ServerName serverName) throws IOException {
+    LOG.warn("Stopping zookeeper nodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException
{
+    LOG.warn("Waiting for zookeeper nodes to start on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException
{
+    LOG.warn("Waiting for zookeeper nodes to stop on mini cluster is not supported");
+  }
+
+  @Override
+  public void startDataNode(ServerName serverName) throws IOException {
+    LOG.warn("Starting datanodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void killDataNode(ServerName serverName) throws IOException {
+    LOG.warn("Aborting datanodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void stopDataNode(ServerName serverName) throws IOException {
+    LOG.warn("Stopping datanodes on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException
{
+    LOG.warn("Waiting for datanodes to start on mini cluster is not supported");
+  }
+
+  @Override
+  public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException
{
+    LOG.warn("Waiting for datanodes to stop on mini cluster is not supported");
+  }
+
+  @Override
   public void startMaster(String hostname, int port) throws IOException {
     this.startMaster();
   }


Mime
View raw message