hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From apurt...@apache.org
Subject [4/4] hbase git commit: HBASE-13996 Add write sniffing in canary (Liu Shaohui)
Date Sat, 22 Aug 2015 17:08:49 GMT
HBASE-13996 Add write sniffing in canary (Liu Shaohui)

Conflicts:
	hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
	hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
	hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java

Amending-Author: Andrew Purtell <apurtell@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/216dbbc2
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/216dbbc2
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/216dbbc2

Branch: refs/heads/0.98
Commit: 216dbbc2fd16cd5ce0b1aaaaddc4e5bdf634e32e
Parents: c9fa5a0
Author: Andrew Purtell <apurtell@apache.org>
Authored: Fri Aug 21 22:44:56 2015 -0700
Committer: Andrew Purtell <apurtell@apache.org>
Committed: Fri Aug 21 23:12:57 2015 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/HConstants.java     |  27 ++-
 .../hbase/tmpl/master/MasterStatusTmpl.jamon    |  12 +-
 .../org/apache/hadoop/hbase/tool/Canary.java    | 242 +++++++++++++++++--
 .../hadoop/hbase/HBaseTestingUtility.java       |   3 +-
 src/main/asciidoc/_chapters/ops_mgt.adoc        |  21 ++
 5 files changed, 276 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/216dbbc2/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
----------------------------------------------------------------------
diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
index 0915c7b..64000f7 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
@@ -785,7 +785,8 @@ public final class HConstants {
   /**
    * timeout for short operation RPC
    */
-  public static final String HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY = "hbase.rpc.shortoperation.timeout";
+  public static final String HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY =
+      "hbase.rpc.shortoperation.timeout";
 
   /**
    * Default value of {@link #HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY}
@@ -840,8 +841,8 @@ public final class HConstants {
     */
   public static final float HBASE_CLUSTER_MINIMUM_MEMORY_THRESHOLD = 0.2f;
 
-  public static final Pattern CP_HTD_ATTR_KEY_PATTERN = Pattern.compile
-      ("^coprocessor\\$([0-9]+)$", Pattern.CASE_INSENSITIVE);
+  public static final Pattern CP_HTD_ATTR_KEY_PATTERN =
+      Pattern.compile("^coprocessor\\$([0-9]+)$", Pattern.CASE_INSENSITIVE);
   public static final Pattern CP_HTD_ATTR_VALUE_PATTERN =
       Pattern.compile("(^[^\\|]*)\\|([^\\|]+)\\|[\\s]*([\\d]*)[\\s]*(\\|.*)?$");
 
@@ -934,7 +935,8 @@ public final class HConstants {
     "hbase.regionserver.wal.enablecompression";
 
   /** Region in Transition metrics threshold time */
-  public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD="hbase.metrics.rit.stuck.warning.threshold";
+  public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
+      "hbase.metrics.rit.stuck.warning.threshold";
 
   public static final String LOAD_BALANCER_SLOP_KEY = "hbase.regions.slop";
 
@@ -1027,7 +1029,8 @@ public final class HConstants {
    * 0.0.0.0.
    * @see <a href="https://issues.apache.org/jira/browse/HBASE-9961">HBASE-9961</a>
    */
-  public static final String STATUS_MULTICAST_BIND_ADDRESS = "hbase.status.multicast.bind.address.ip";
+  public static final String STATUS_MULTICAST_BIND_ADDRESS =
+      "hbase.status.multicast.bind.address.ip";
   public static final String DEFAULT_STATUS_MULTICAST_BIND_ADDRESS = "0.0.0.0";
 
   /**
@@ -1106,6 +1109,20 @@ public final class HConstants {
   public static final String REGION_SPLIT_THREADS_MAX =
     "hbase.regionserver.region.split.threads.max";
 
+  /** Canary config keys */
+  public static final String HBASE_CANARY_WRITE_DATA_TTL_KEY = "hbase.canary.write.data.ttl";
+
+  public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY =
+      "hbase.canary.write.perserver.regions.lowerLimit";
+
+  public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY =
+      "hbase.canary.write.perserver.regions.upperLimit";
+
+  public static final String HBASE_CANARY_WRITE_VALUE_SIZE_KEY = "hbase.canary.write.value.size";
+
+  public static final String HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY =
+      "hbase.canary.write.table.check.period";
+
   private HConstants() {
     // Can't be instantiated with this ctor.
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/216dbbc2/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
index 1b60763..cdcb14f 100644
--- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
+++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
@@ -49,6 +49,7 @@ org.apache.hadoop.hbase.master.RegionState;
 org.apache.hadoop.hbase.HTableDescriptor;
 org.apache.hadoop.hbase.HBaseConfiguration;
 org.apache.hadoop.hbase.TableName;
+org.apache.hadoop.hbase.tool.Canary;
 org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
 org.apache.hadoop.hbase.master.DeadServer;
 org.apache.hadoop.hbase.protobuf.ProtobufUtil;
@@ -359,12 +360,15 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
     </%if>
     <%java>String description = null;
         if (tableName.equals(TableName.META_TABLE_NAME)){
-            description = "The hbase:meta table holds references to all User Table regions";
+            description = "The hbase:meta table holds references to all User Table regions.";
+        } else if (tableName.equals(Canary.DEFAULT_WRITE_TABLE_NAME)){
+            description = "The hbase:canary table is used to sniff the write availbility
of"
+              + " each regionserver.";
         } else if (tableName.equals(AccessControlLists.ACL_TABLE_NAME)){
             description = "The hbase:acl table holds information about acl";
-	 } else if (tableName.equals(VisibilityConstants.LABELS_TABLE_NAME)){
-	     description = "The hbase:labels table holds information about visibility labels";
-        } else {
+        } else if (tableName.equals(VisibilityConstants.LABELS_TABLE_NAME)){
+            description = "The hbase:labels table holds information about visibility labels.";
+        } else if (tableName.equals(TableName.NAMESPACE_TABLE_NAME)){
             description = "The hbase:namespace table holds information about namespaces.";
         }
     </%java>

http://git-wip-us.apache.org/repos/asf/hbase/blob/216dbbc2/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
index d08f344..c2ef7a2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
@@ -24,6 +24,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -47,8 +48,11 @@ import org.apache.hadoop.hbase.AuthUtil;
 import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HRegionLocation;
 import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.NamespaceDescriptor;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.TableNotEnabledException;
@@ -59,13 +63,20 @@ import org.apache.hadoop.hbase.client.HConnection;
 import org.apache.hadoop.hbase.client.HConnectionManager;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.HTableInterface;
+import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.ResultScanner;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.tool.Canary.RegionTask.TaskType;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.ReflectionUtils;
+import org.apache.hadoop.hbase.util.RegionSplitter;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 
+import com.google.protobuf.ServiceException;
+
 /**
  * HBase Canary Tool, that that can be used to do
  * "canary monitoring" of a running HBase cluster.
@@ -83,6 +94,9 @@ public final class Canary implements Tool {
     public void publishReadFailure(HRegionInfo region, Exception e);
     public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception
e);
     public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
+    public void publishWriteFailure(HRegionInfo region, Exception e);
+    public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception
e);
+    public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
   }
   // new extended sink for output regionserver mode info
   // do not change the Sink interface directly due to maintaining the API
@@ -110,6 +124,23 @@ public final class Canary implements Tool {
       LOG.info(String.format("read from region %s column family %s in %dms",
                region.getRegionNameAsString(), column.getNameAsString(), msTime));
     }
+
+    @Override
+    public void publishWriteFailure(HRegionInfo region, Exception e) {
+      LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()),
e);
+    }
+
+    @Override
+    public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception
e) {
+      LOG.error(String.format("write to region %s column family %s failed",
+        region.getRegionNameAsString(), column.getNameAsString()), e);
+    }
+
+    @Override
+    public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime)
{
+      LOG.info(String.format("write to region %s column family %s in %dms",
+        region.getRegionNameAsString(), column.getNameAsString(), msTime));
+    }
   }
   // a ExtendedSink implementation
   public static class RegionServerStdOutSink extends StdOutSink implements ExtendedSink {
@@ -130,19 +161,35 @@ public final class Canary implements Tool {
    * For each column family of the region tries to get one row and outputs the latency, or
the
    * failure.
    */
-  static class RegionTask implements Callable<Void> {
+  public static class RegionTask implements Callable<Void> {
+    public enum TaskType{
+      READ, WRITE
+    }
     private HConnection connection;
     private HRegionInfo region;
     private Sink sink;
+    private TaskType taskType;
 
-    RegionTask(HConnection connection, HRegionInfo region, Sink sink) {
+    RegionTask(HConnection connection, HRegionInfo region, Sink sink, TaskType taskType)
{
       this.connection = connection;
       this.region = region;
       this.sink = sink;
+      this.taskType = taskType;
     }
 
     @Override
     public Void call() {
+      switch (taskType) {
+      case READ:
+        return read();
+      case WRITE:
+        return write();
+      default:
+        return read();
+      }
+    }
+
+    public Void read() {
       HTableInterface table = null;
       HTableDescriptor tableDesc = null;
       try {
@@ -155,6 +202,7 @@ public final class Canary implements Tool {
           try {
             table.close();
           } catch (IOException ioe) {
+            LOG.error("Close table failed", e);
           }
         }
         return null;
@@ -209,6 +257,44 @@ public final class Canary implements Tool {
       try {
         table.close();
       } catch (IOException e) {
+        LOG.error("Close table failed", e);
+      }
+      return null;
+    }
+
+    /**
+     * Check writes for the canary table
+     * @return
+     */
+    private Void write() {
+      HTableInterface table = null;
+      HTableDescriptor tableDesc = null;
+      try {
+        table = connection.getTable(region.getTable());
+        tableDesc = table.getTableDescriptor();
+        byte[] rowToCheck = region.getStartKey();
+        if (rowToCheck.length == 0) {
+          rowToCheck = new byte[]{0x0};
+        }
+        int writeValueSize =
+            connection.getConfiguration().getInt(HConstants.HBASE_CANARY_WRITE_VALUE_SIZE_KEY,
10);
+        for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
+          Put put = new Put(rowToCheck);
+          byte[] value = new byte[writeValueSize];
+          Bytes.random(value);
+          put.add(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
+          try {
+            long startTime = System.currentTimeMillis();
+            table.put(put);
+            long time = System.currentTimeMillis() - startTime;
+            sink.publishWriteTiming(region, column, time);
+          } catch (Exception e) {
+            sink.publishWriteFailure(region, column, e);
+          }
+        }
+        table.close();
+      } catch (IOException e) {
+        sink.publishWriteFailure(region, e);
       }
       return null;
     }
@@ -266,6 +352,7 @@ public final class Canary implements Tool {
         }
         sink.publishReadTiming(tableName.getNameAsString(), serverName, stopWatch.getTime());
       } catch (TableNotFoundException tnfe) {
+        LOG.error("Table may be deleted", tnfe);
         // This is ignored because it doesn't imply that the regionserver is dead
       } catch (TableNotEnabledException tnee) {
         // This is considered a success since we got a response.
@@ -281,6 +368,7 @@ public final class Canary implements Tool {
           try {
             table.close();
           } catch (IOException e) {/* DO NOTHING */
+            LOG.error("Close table failed", e);
           }
         }
         scan = null;
@@ -299,11 +387,15 @@ public final class Canary implements Tool {
   private static final long DEFAULT_INTERVAL = 6000;
 
   private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
-
   private static final int MAX_THREADS_NUM = 16; // #threads to contact regions
 
   private static final Log LOG = LogFactory.getLog(Canary.class);
 
+  public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
+    NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");
+
+  private static final String CANARY_TABLE_FAMILY_NAME = "Test";
+
   private Configuration conf = null;
   private long interval = 0;
   private Sink sink = null;
@@ -312,6 +404,9 @@ public final class Canary implements Tool {
   private long timeout = DEFAULT_TIMEOUT;
   private boolean failOnError = true;
   private boolean regionServerMode = false;
+  private boolean writeSniffing = false;
+  private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
+
   private ExecutorService executor; // threads to retrieve data from regionservers
 
   public Canary() {
@@ -333,10 +428,8 @@ public final class Canary implements Tool {
     this.conf = conf;
   }
 
-  @Override
-  public int run(String[] args) throws Exception {
+  private int parseArgs(String[] args) {
     int index = -1;
-
     // Process command line args
     for (int i = 0; i < args.length; i++) {
       String cmd = args[i];
@@ -371,6 +464,8 @@ public final class Canary implements Tool {
           }
         } else if(cmd.equals("-regionserver")) {
           this.regionServerMode = true;
+        } else if(cmd.equals("-writeSniffing")) {
+          this.writeSniffing = true;
         } else if (cmd.equals("-e")) {
           this.useRegExp = true;
         } else if (cmd.equals("-t")) {
@@ -387,7 +482,14 @@ public final class Canary implements Tool {
             System.err.println("-t needs a numeric value argument.");
             printUsageAndExit();
           }
+        } else if (cmd.equals("-writeTable")) {
+          i++;
 
+          if (i == args.length) {
+            System.err.println("-writeTable needs a string value argument.");
+            printUsageAndExit();
+          }
+          this.writeTableName = TableName.valueOf(args[i]);
         } else if (cmd.equals("-f")) {
           i++;
 
@@ -408,6 +510,12 @@ public final class Canary implements Tool {
         index = i;
       }
     }
+    return index;
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    int index = parseArgs(args);
 
     // Launches chore for refreshing kerberos credentials if security is enabled.
     // Please see http://hbase.apache.org/book.html#_running_canary_in_a_kerberos_enabled_cluster
@@ -487,6 +595,9 @@ public final class Canary implements Tool {
     System.err.println("   -f <B>         stop whole program if first error occurs,"
+
         " default is true");
     System.err.println("   -t <N>         timeout for a check, default is 600000 (milisecs)");
+    System.err.println("   -writeSniffing enable the write sniffing in canary");
+    System.err.println("   -writeTable    The table used for write sniffing."
+        + " Default is hbase:canary");
     System.exit(USAGE_EXIT_CODE);
   }
 
@@ -513,7 +624,8 @@ public final class Canary implements Tool {
               (ExtendedSink) this.sink, this.executor);
     } else {
       monitor =
-          new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor);
+          new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor,
+              this.writeSniffing, this.writeTableName);
     }
     return monitor;
   }
@@ -576,10 +688,34 @@ public final class Canary implements Tool {
 
   // a monitor for region mode
   private static class RegionMonitor extends Monitor {
+    // 10 minutes
+    private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
+    // 1 days
+    private static final int DEFAULT_WRITE_DATA_TTL = 24 * 60 * 60;
+
+    private long lastCheckTime = -1;
+    private boolean writeSniffing;
+    private TableName writeTableName;
+    private int writeDataTTL;
+    private float regionsLowerLimit;
+    private float regionsUpperLimit;
+    private int checkPeriod;
 
     public RegionMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
-        Sink sink, ExecutorService executor) {
+        Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName)
{
       super(connection, monitorTargets, useRegExp, sink, executor);
+      Configuration conf = connection.getConfiguration();
+      this.writeSniffing = writeSniffing;
+      this.writeTableName = writeTableName;
+      this.writeDataTTL =
+          conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL);
+      this.regionsLowerLimit =
+          conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY, 1.0f);
+      this.regionsUpperLimit =
+          conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY, 1.5f);
+      this.checkPeriod =
+          conf.getInt(HConstants.HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY,
+            DEFAULT_WRITE_TABLE_CHECK_PERIOD);
     }
 
     @Override
@@ -591,11 +727,26 @@ public final class Canary implements Tool {
             String[] tables = generateMonitorTables(this.targets);
             this.initialized = true;
             for (String table : tables) {
-              taskFutures.addAll(Canary.sniff(connection, sink, table, executor));
+              taskFutures.addAll(Canary.sniff(connection, sink, table, executor, TaskType.READ));
             }
           } else {
-            taskFutures.addAll(sniff());
+            taskFutures.addAll(sniff(TaskType.READ));
           }
+
+          if (writeSniffing) {
+            if (EnvironmentEdgeManager.currentTimeMillis() - lastCheckTime > checkPeriod)
{
+              try {
+                checkWriteTableDistribution();
+              } catch (IOException e) {
+                LOG.error("Check canary table distribution failed!", e);
+              }
+              lastCheckTime = EnvironmentEdgeManager.currentTimeMillis();
+            }
+            // sniff canary table with write operation
+            taskFutures.addAll(Canary.sniff(connection, sink,
+              writeTableName.getNameAsString(), executor, TaskType.WRITE));
+          }
+
           for (Future<Void> future : taskFutures) {
             try {
               future.get();
@@ -651,25 +802,77 @@ public final class Canary implements Tool {
     /*
      * canary entry point to monitor all the tables.
      */
-    private List<Future<Void>> sniff() throws Exception {
+    private List<Future<Void>> sniff(TaskType taskType) throws Exception {
       List<Future<Void>> taskFutures = new LinkedList<Future<Void>>();
       for (HTableDescriptor table : admin.listTables()) {
-        if (admin.isTableEnabled(table.getTableName())) {
-          taskFutures.addAll(Canary.sniff(connection, sink, table.getTableName(), executor));
+        if (admin.isTableEnabled(table.getTableName())
+            && (!table.getTableName().equals(writeTableName))) {
+          taskFutures.addAll(Canary.sniff(connection, sink, table.getTableName(), executor,
+            taskType));
         }
       }
       return taskFutures;
     }
+
+    private void checkWriteTableDistribution() throws IOException, ServiceException {
+      if (!admin.tableExists(writeTableName)) {
+        int numberOfServers = admin.getClusterStatus().getServers().size();
+        if (numberOfServers == 0) {
+          throw new IllegalStateException("No live regionservers");
+        }
+        createWriteTable(numberOfServers);
+      }
+
+      if (!admin.isTableEnabled(writeTableName)) {
+        admin.enableTable(writeTableName);
+      }
+
+      int numberOfServers = admin.getClusterStatus().getServers().size();
+      List<HRegionLocation> locations = connection.locateRegions(writeTableName);
+      int numberOfRegions = locations.size();
+      if (numberOfRegions < numberOfServers * regionsLowerLimit
+          || numberOfRegions > numberOfServers * regionsUpperLimit) {
+        admin.disableTable(writeTableName);
+        admin.deleteTable(writeTableName);
+        createWriteTable(numberOfServers);
+      }
+      HashSet<ServerName> serverSet = new HashSet<ServerName>();
+      for (HRegionLocation location: locations) {
+        serverSet.add(location.getServerName());
+      }
+      int numberOfCoveredServers = serverSet.size();
+      if (numberOfCoveredServers < numberOfServers) {
+        admin.balancer();
+      }
+    }
+
+    private void createWriteTable(int numberOfServers) throws IOException {
+      int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
+      LOG.info("Number of live regionservers: " + numberOfServers + ", "
+          + "pre-splitting the canary table into " + numberOfRegions + " regions "
+          + "(current  lower limi of regions per server is " + regionsLowerLimit
+          + " and you can change it by config: "
+          + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
+      HTableDescriptor desc = new HTableDescriptor(writeTableName);
+      HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
+      family.setMaxVersions(1);
+      family.setTimeToLive(writeDataTTL);
+
+      desc.addFamily(family);
+      byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions);
+      admin.createTable(desc, splits);
+    }
   }
 
   /**
    * Canary entry point for specified table.
    * @throws Exception
    */
-  public static void sniff(final HConnection connection, TableName tableName) throws Exception
{
+  public static void sniff(final HConnection connection, TableName tableName, TaskType taskType)
+      throws Exception {
     List<Future<Void>> taskFutures =
         Canary.sniff(connection, new StdOutSink(), tableName.getNameAsString(),
-          new ScheduledThreadPoolExecutor(1));
+          new ScheduledThreadPoolExecutor(1), taskType);
     for (Future<Void> future : taskFutures) {
       future.get();
     }
@@ -680,11 +883,12 @@ public final class Canary implements Tool {
    * @throws Exception
    */
   private static List<Future<Void>> sniff(final HConnection connection, final
Sink sink,
-    String tableName, ExecutorService executor) throws Exception {
+    String tableName, ExecutorService executor, TaskType taskType) throws Exception {
     HBaseAdmin admin = new HBaseAdmin(connection);
     try {
       if (admin.isTableEnabled(TableName.valueOf(tableName))) {
-        return Canary.sniff(connection, sink, TableName.valueOf(tableName), executor);
+        return Canary.sniff(connection, sink, TableName.valueOf(tableName), executor,
+          taskType);
       } else {
         LOG.warn(String.format("Table %s is not enabled", tableName));
       }
@@ -698,7 +902,7 @@ public final class Canary implements Tool {
    * Loops over regions that owns this table, and output some information abouts the state.
    */
   private static List<Future<Void>> sniff(final HConnection connection, final
Sink sink,
-      TableName tableName, ExecutorService executor) throws Exception {
+      TableName tableName, ExecutorService executor, TaskType taskType) throws Exception
{
     HTableInterface table = null;
     try {
       table = connection.getTable(tableName);
@@ -708,7 +912,7 @@ public final class Canary implements Tool {
     List<RegionTask> tasks = new ArrayList<RegionTask>();
     try {
       for (HRegionInfo region : ((HTable)table).getRegionLocations().keySet()) {
-        tasks.add(new RegionTask(connection, region, sink));
+        tasks.add(new RegionTask(connection, region, sink, taskType));
       }
     } finally {
       table.close();

http://git-wip-us.apache.org/repos/asf/hbase/blob/216dbbc2/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
index 6e665c4..d0aacff 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
@@ -88,6 +88,7 @@ import org.apache.hadoop.hbase.regionserver.wal.HLog;
 import org.apache.hadoop.hbase.security.User;
 import org.apache.hadoop.hbase.security.visibility.VisibilityLabelsCache;
 import org.apache.hadoop.hbase.tool.Canary;
+import org.apache.hadoop.hbase.tool.Canary.RegionTask.TaskType;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.FSTableDescriptors;
 import org.apache.hadoop.hbase.util.FSUtils;
@@ -2822,7 +2823,7 @@ public class HBaseTestingUtility extends HBaseCommonTestingUtility {
     // RegionOpeningException.  It is crass but when done all will be online.
     HConnection connection = HConnectionManager.createConnection(conf);
     try {
-      Canary.sniff(connection, TableName.valueOf(table));
+      Canary.sniff(connection, TableName.valueOf(table), TaskType.READ);
     } catch (Exception e) {
       throw new IOException(e);
     } finally {

http://git-wip-us.apache.org/repos/asf/hbase/blob/216dbbc2/src/main/asciidoc/_chapters/ops_mgt.adoc
----------------------------------------------------------------------
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index 514003d..af99215 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -92,6 +92,8 @@ Usage: bin/hbase org.apache.hadoop.hbase.tool.Canary [opts] [table1 [table2]...]
       which means the region/regionserver is regular expression pattern
    -f <B>         stop whole program if first error occurs, default is true
    -t <N>         timeout for a check, default is 600000 (milliseconds)
+   -writeSniffing enable the write sniffing in canary
+   -writeTable    The table used for write sniffing. Default is hbase:canary
 ----
 
 This tool will return non zero error codes to user for collaborating with other monitoring
tools, such as Nagios.
@@ -193,6 +195,25 @@ This run sets the timeout value to 60 seconds, the default value is 600
seconds.
 $ ${HBASE_HOME}/bin/hbase org.apache.hadoop.hbase.tool.Canary -t 600000
 ----
 
+==== Enable write sniffing in canary
+
+By default, the canary tool only check the read operations, it's hard to find the problem
in the
+write path. To enable the write sniffing, you can run canary with the `-writeSniffing` option.
+When the write sniffing is enabled, the canary tool will create a hbase table and make sure
the
+regions of the table distributed on all region servers. In each sniffing period, the canary
will
+try to put data to these regions to check the write availability of each region server.
+----
+$ ${HBASE_HOME}/bin/hbase org.apache.hadoop.hbase.tool.Canary -writeSniffing
+----
+
+The default write table is `hbase:canary` and can be specified by the option `-writeTable`.
+----
+$ ${HBASE_HOME}/bin/hbase org.apache.hadoop.hbase.tool.Canary -writeSniffing -writeTable
ns:canary
+----
+
+The default value size of each put is 10 bytes and you can set it by the config key:
+`hbase.canary.write.value.size`.
+
 ==== Running Canary in a Kerberos-enabled Cluster
 
 To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:


Mime
View raw message