hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From te...@apache.org
Subject hbase git commit: HBASE-15219 Canary tool does not return non-zero exit code when one of regions is in stuck state
Date Sun, 21 Feb 2016 04:39:21 GMT
Repository: hbase
Updated Branches:
  refs/heads/0.98 95b55fea8 -> ba03fd4b3


HBASE-15219 Canary tool does not return non-zero exit code when one of regions is in stuck
state


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/ba03fd4b
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/ba03fd4b
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/ba03fd4b

Branch: refs/heads/0.98
Commit: ba03fd4b3eafa10f0690486c963725428024fdc3
Parents: 95b55fe
Author: tedyu <yuzhihong@gmail.com>
Authored: Sat Feb 20 20:39:23 2016 -0800
Committer: tedyu <yuzhihong@gmail.com>
Committed: Sat Feb 20 20:39:23 2016 -0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/tool/Canary.java    | 64 +++++++++++++++++---
 src/main/asciidoc/_chapters/ops_mgt.adoc        | 11 ++++
 2 files changed, 66 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/ba03fd4b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
index 5568dc3..9e0696c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
@@ -37,6 +37,7 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.ScheduledThreadPoolExecutor;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -92,9 +93,12 @@ import com.google.protobuf.ServiceException;
 public final class Canary implements Tool {
   // Sink interface used by the canary to outputs information
   public interface Sink {
+    public long getReadFailureCount();
+    public long incReadFailureCount();
     public void publishReadFailure(HRegionInfo region, Exception e);
     public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception
e);
     public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
+    public long getWriteFailureCount();
     public void publishWriteFailure(HRegionInfo region, Exception e);
     public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception
e);
     public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
@@ -109,13 +113,28 @@ public final class Canary implements Tool {
   // Simple implementation of canary sink that allows to plot on
   // file or standard output timings or failures.
   public static class StdOutSink implements Sink {
+    private AtomicLong readFailureCount = new AtomicLong(0),
+        writeFailureCount = new AtomicLong(0);
+
+    @Override
+    public long getReadFailureCount() {
+      return readFailureCount.get();
+    }
+
+    @Override
+    public long incReadFailureCount() {
+      return readFailureCount.incrementAndGet();
+    }
+
     @Override
     public void publishReadFailure(HRegionInfo region, Exception e) {
+      readFailureCount.incrementAndGet();
       LOG.error(String.format("read from region %s failed", region.getRegionNameAsString()),
e);
     }
 
     @Override
     public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception
e) {
+      readFailureCount.incrementAndGet();
       LOG.error(String.format("read from region %s column family %s failed",
                 region.getRegionNameAsString(), column.getNameAsString()), e);
     }
@@ -127,12 +146,19 @@ public final class Canary implements Tool {
     }
 
     @Override
+    public long getWriteFailureCount() {
+      return writeFailureCount.get();
+    }
+
+    @Override
     public void publishWriteFailure(HRegionInfo region, Exception e) {
+      writeFailureCount.incrementAndGet();
       LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()),
e);
     }
 
     @Override
     public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception
e) {
+      writeFailureCount.incrementAndGet();
       LOG.error(String.format("write to region %s column family %s failed",
         region.getRegionNameAsString(), column.getNameAsString()), e);
     }
@@ -148,6 +174,7 @@ public final class Canary implements Tool {
 
     @Override
     public void publishReadFailure(String table, String server) {
+      incReadFailureCount();
       LOG.error(String.format("Read from table:%s on region server:%s", table, server));
     }
 
@@ -405,6 +432,7 @@ public final class Canary implements Tool {
   private static final int INIT_ERROR_EXIT_CODE = 2;
   private static final int TIMEOUT_ERROR_EXIT_CODE = 3;
   private static final int ERROR_EXIT_CODE = 4;
+  private static final int FAILURE_EXIT_CODE = 5;
 
   private static final long DEFAULT_INTERVAL = 6000;
 
@@ -427,6 +455,7 @@ public final class Canary implements Tool {
   private boolean failOnError = true;
   private boolean regionServerMode = false;
   private boolean writeSniffing = false;
+  private boolean treatFailureAsError = false;
   private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
 
   private ExecutorService executor; // threads to retrieve data from regionservers
@@ -488,6 +517,8 @@ public final class Canary implements Tool {
           this.regionServerMode = true;
         } else if(cmd.equals("-writeSniffing")) {
           this.writeSniffing = true;
+        } else if(cmd.equals("-treatFailureAsError")) {
+          this.treatFailureAsError = true;
         } else if (cmd.equals("-e")) {
           this.useRegExp = true;
         } else if (cmd.equals("-t")) {
@@ -584,7 +615,7 @@ public final class Canary implements Tool {
             }
           }
 
-          if (this.failOnError && monitor.hasError()) {
+          if (this.failOnError && monitor.finalCheckForErrors()) {
             monitorThread.interrupt();
             return monitor.errorCode;
           }
@@ -617,6 +648,7 @@ public final class Canary implements Tool {
         " default is true");
     System.err.println("   -t <N>         timeout for a check, default is 600000 (milisecs)");
     System.err.println("   -writeSniffing enable the write sniffing in canary");
+    System.err.println("   -treatFailureAsError treats read / write failure as error");
     System.err.println("   -writeTable    The table used for write sniffing."
         + " Default is hbase:canary");
     System.err
@@ -644,11 +676,11 @@ public final class Canary implements Tool {
     if (this.regionServerMode) {
       monitor =
           new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
-              (ExtendedSink) this.sink, this.executor);
+              (ExtendedSink) this.sink, this.executor, this.treatFailureAsError);
     } else {
       monitor =
           new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor,
-              this.writeSniffing, this.writeTableName);
+              this.writeSniffing, this.writeTableName, this.treatFailureAsError);
     }
     return monitor;
   }
@@ -660,6 +692,7 @@ public final class Canary implements Tool {
     protected HBaseAdmin admin;
     protected String[] targets;
     protected boolean useRegExp;
+    protected boolean treatFailureAsError;
     protected boolean initialized = false;
 
     protected boolean done = false;
@@ -675,13 +708,25 @@ public final class Canary implements Tool {
       return errorCode != 0;
     }
 
+    public boolean finalCheckForErrors() {
+      if (errorCode != 0) {
+        return true;
+      }
+      if (treatFailureAsError &&
+          (sink.getReadFailureCount() > 0 || sink.getWriteFailureCount() > 0)) {
+        errorCode = FAILURE_EXIT_CODE;
+        return true;
+      }
+      return false;
+    }
+
     @Override
     public void close() throws IOException {
       if (this.admin != null) this.admin.close();
     }
 
     protected Monitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
Sink sink,
-        ExecutorService executor) {
+        ExecutorService executor, boolean treatFailureAsError) {
       if (null == connection) throw new IllegalArgumentException("connection shall not be
null");
 
       this.connection = connection;
@@ -726,8 +771,9 @@ public final class Canary implements Tool {
     private int checkPeriod;
 
     public RegionMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
-        Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName)
{
-      super(connection, monitorTargets, useRegExp, sink, executor);
+        Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
+        boolean treatFailureAsError) {
+      super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError);
       Configuration conf = connection.getConfiguration();
       this.writeSniffing = writeSniffing;
       this.writeTableName = writeTableName;
@@ -1022,8 +1068,8 @@ public final class Canary implements Tool {
   private static class RegionServerMonitor extends Monitor {
 
     public RegionServerMonitor(HConnection connection, String[] monitorTargets, boolean useRegExp,
-        ExtendedSink sink, ExecutorService executor) {
-      super(connection, monitorTargets, useRegExp, sink, executor);
+        ExtendedSink sink, ExecutorService executor, boolean treatFailureAsError) {
+      super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError);
     }
 
     private ExtendedSink getSink() {
@@ -1094,7 +1140,7 @@ public final class Canary implements Tool {
         }
       } catch (InterruptedException e) {
         this.errorCode = ERROR_EXIT_CODE;
-        LOG.error("Sniff regionserver failed!", e);
+        LOG.error("Sniff regionserver interrupted!", e);
       }
     }
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/ba03fd4b/src/main/asciidoc/_chapters/ops_mgt.adoc
----------------------------------------------------------------------
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index 7089a16..d7ac987 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -93,6 +93,7 @@ Usage: bin/hbase org.apache.hadoop.hbase.tool.Canary [opts] [table1 [table2]...]
    -f <B>         stop whole program if first error occurs, default is true
    -t <N>         timeout for a check, default is 600000 (milliseconds)
    -writeSniffing enable the write sniffing in canary
+   -treatFailureAsError treats read / write failure as error
    -writeTable    The table used for write sniffing. Default is hbase:canary
    -D<configProperty>=<value> assigning or override the configuration params
 ----
@@ -215,6 +216,16 @@ $ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary
 The default value size of each put is 10 bytes and you can set it by the config key:
 `hbase.canary.write.value.size`.
 
+==== Treat read / write failure as error
+
+By default, the canary tool only logs read failures (caused by, e.g., RetriesExhaustedException)
+and still returns a normal exit code. To treat read / write failures as errors, run canary
+with the `-treatFailureAsError` option. When enabled, any read / write failure results in an error
+exit code.
+----
+$ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError
+----
+
 ==== Running Canary in a Kerberos-enabled Cluster
 
 To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_:


Mime
View raw message