hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From apurt...@apache.org
Subject svn commit: r1428142 - in /hbase/branches/0.94/src: examples/healthcheck/ main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/regionserver/ test/java/org/apache/hadoop/hbase/
Date Thu, 03 Jan 2013 02:12:57 GMT
Author: apurtell
Date: Thu Jan  3 02:12:57 2013
New Revision: 1428142

URL: http://svn.apache.org/viewvc?rev=1428142&view=rev
Log:
HBASE-7351, HBASE-7399, HBASE-7406. Periodic health check chore (Vandana Ayyalasomayajula)

Added:
    hbase/branches/0.94/src/examples/healthcheck/
    hbase/branches/0.94/src/examples/healthcheck/healthcheck.sh
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthChecker.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthReport.java
    hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestNodeHealthCheckChore.java
Modified:
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HConstants.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Added: hbase/branches/0.94/src/examples/healthcheck/healthcheck.sh
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/examples/healthcheck/healthcheck.sh?rev=1428142&view=auto
==============================================================================
--- hbase/branches/0.94/src/examples/healthcheck/healthcheck.sh (added)
+++ hbase/branches/0.94/src/examples/healthcheck/healthcheck.sh Thu Jan  3 02:12:57 2013
@@ -0,0 +1,84 @@
+#!/bin/bash 
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements.  See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership.  The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License.  You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # This is an example script for checking health of a node ( master or region server). 
+ # The health chore script should essentially output an message containing "ERROR" at an undesirable
+ # outcome of the checks in the script. 
+
+err=0;
+
+function check_disks {
+
+for m in `awk '$3~/ext3/ {printf" %s ",$2}' /etc/fstab` ; do
+    fsdev=""
+    fsdev=`awk -v m=$m '$2==m {print $1}' /proc/mounts`;
+    if [ -z "$fsdev" ] ; then
+      msg_="$msg_ $m(u)"
+    else
+      msg_="$msg_`awk -v m=$m '$2==m { if ( $4 ~ /^ro,/ ) {printf"%s(ro)",$2 } ; }' /proc/mounts`"
+    fi
+  done
+
+  if [ -z "$msg_" ] ; then
+    echo "disks ok" ; exit 0
+  else
+    echo "$msg_" ; exit 2
+  fi
+
+}
+
+function check_link {
+  /usr/bin/snmpwalk -t 5 -Oe  -Oq  -Os -v 1 -c public localhost if | \
+        awk ' { 
+          split($1,a,".") ;
+          if ( a[1] == "ifIndex" ) { ifIndex[a[2]] = $2 }
+          if ( a[1] == "ifDescr" ) { ifDescr[a[2]] = $2 }
+          if ( a[1] == "ifType" ) { ifType[a[2]] = $2 }
+          if ( a[1] == "ifSpeed" ) { ifSpeed[a[2]] = $2 }
+          if ( a[1] == "ifAdminStatus" ) { ifAdminStatus[a[2]] = $2 }
+          if ( a[1] == "ifOperStatus" ) { ifOperStatus[a[2]] = $2 }
+        }
+        END {
+        up=0;
+        for (i in ifIndex ) {
+          if ( ifType[i] == 6 && ifAdminStatus[i] == 1 && ifOperStatus[i] == 1 && ifSpeed[i] == 1000000000 ) {
+            up=i;
+          }
+        }
+        if ( up == 0 ) { print "check link" ; exit 2 }
+        else { print ifDescr[up],"ok" }
+        }'
+  exit $? ;
+}
+
+for check in disks link ; do
+  msg=`check_${check}` ;
+  if [ $? -eq 0 ] ; then
+    ok_msg="$ok_msg$msg,"
+  else
+    err_msg="$err_msg$msg,"
+  fi
+done
+
+if [ ! -z "$err_msg" ] ; then
+  echo -n "ERROR $err_msg " 
+fi
+if [ ! -z "$ok_msg" ] ; then
+  echo -n "OK: $ok_msg" 
+fi
+echo
+exit 0

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HConstants.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HConstants.java?rev=1428142&r1=1428141&r2=1428142&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HConstants.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HConstants.java Thu Jan  3 02:12:57 2013
@@ -672,6 +672,19 @@ public final class HConstants {
           Bytes.toString(META_TABLE_NAME), Bytes.toString(ROOT_TABLE_NAME), SPLIT_LOGDIR_NAME,
           HBCK_SIDELINEDIR_NAME, HFILE_ARCHIVE_DIRECTORY }));
 
+  /** Health script related settings. */
+  public static final String HEALTH_SCRIPT_LOC = "hbase.node.health.script.location";
+  public static final String HEALTH_SCRIPT_TIMEOUT = "hbase.node.health.script.timeout";
+  public static final String HEALTH_CHORE_WAKE_FREQ =
+      "hbase.node.health.script.frequency";
+  public static final long DEFAULT_HEALTH_SCRIPT_TIMEOUT = 60000;
+  /**
+   * The maximum number of health check failures a server can encounter consecutively.
+   */
+  public static final String HEALTH_FAILURE_THRESHOLD =
+      "hbase.node.health.failure.threshold";
+  public static final int DEFAULT_HEALTH_FAILURE_THRESHOLD = 3;
+
   private HConstants() {
     // Can't be instantiated with this ctor.
   }

Added: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java?rev=1428142&view=auto
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java (added)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java Thu Jan  3 02:12:57 2013
@@ -0,0 +1,95 @@
+/**
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * The Class HealthCheckChore for running health checker regularly.
+ */
+ public class HealthCheckChore extends Chore {
+  private static Log LOG = LogFactory.getLog(HealthCheckChore.class);
+  private HealthChecker healthChecker;
+  private Configuration config;
+  private int threshold;
+  private int numTimesUnhealthy = 0;
+  private long failureWindow;
+  private long startWindow;
+
+  public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
+    super("HealthChecker", sleepTime, stopper);
+    LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
+    this.config = conf;
+    String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC);
+    long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT,
+      HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT);
+    healthChecker = new HealthChecker();
+    healthChecker.init(healthCheckScript, scriptTimeout);
+    this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD,
+      HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD);
+    this.failureWindow = this.threshold * sleepTime;
+  }
+
+  @Override
+  protected void chore() {
+    HealthReport report = healthChecker.checkHealth();
+    boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS);
+    if (!isHealthy) {
+      boolean needToStop = decideToStop();
+      if (needToStop) {
+        this.stopper.stop("The region server reported unhealthy " + threshold
+            + " number of times consecutively.");
+      }
+      // Always log health report.
+      LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : "
+          + report.getHealthReport());
+    }
+  }
+
+  private boolean decideToStop() {
+    boolean stop = false;
+    if (numTimesUnhealthy == 0) {
+      // First time we are seeing a failure. No need to stop, just
+      // record the time.
+      numTimesUnhealthy++;
+      startWindow = System.currentTimeMillis();
+    } else {
+      if ((System.currentTimeMillis() - startWindow) < failureWindow) {
+        numTimesUnhealthy++;
+        if (numTimesUnhealthy == threshold) {
+          stop = true;
+        } else {
+          stop = false;
+        }
+      } else {
+        // Outside of failure window, so we reset to 1.
+        numTimesUnhealthy = 1;
+        startWindow = System.currentTimeMillis();
+        stop = false;
+      }
+    }
+    return stop;
+  }
+
+}

Added: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthChecker.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthChecker.java?rev=1428142&view=auto
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthChecker.java (added)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthChecker.java Thu Jan  3 02:12:57 2013
@@ -0,0 +1,127 @@
+/**
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.Shell.ExitCodeException;
+import org.apache.hadoop.util.Shell.ShellCommandExecutor;
+
+/**
+ * A utility for executing an external script that checks the health of
+ * the node. An example script can be found at
+ * <tt>src/examples/healthcheck/healthcheck.sh</tt>
+ */
+class HealthChecker {
+
+  private static Log LOG = LogFactory.getLog(HealthChecker.class);
+  private ShellCommandExecutor shexec = null;
+  private String exceptionStackTrace;
+
+  /** Pattern used for searching in the output of the node health script */
+  static private final String ERROR_PATTERN = "ERROR";
+
+  private String healthCheckScript;
+  private long scriptTimeout;
+
+  enum HealthCheckerExitStatus {
+    SUCCESS,
+    TIMED_OUT,
+    FAILED_WITH_EXIT_CODE,
+    FAILED_WITH_EXCEPTION,
+    FAILED
+  }
+
+  /**
+   * Initialize.
+   *
+   * @param configuration
+   */
+  public void init(String location, long timeout) {
+    this.healthCheckScript = location;
+    this.scriptTimeout = timeout;
+    ArrayList<String> execScript = new ArrayList<String>();
+    execScript.add(healthCheckScript);
+    this.shexec = new ShellCommandExecutor(execScript.toArray(new String[execScript.size()]), null,
+        null, scriptTimeout);
+    LOG.info("HealthChecker initialized.");
+  }
+
+  public HealthReport checkHealth() {
+    HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
+    try {
+      shexec.execute();
+    } catch (ExitCodeException e) {
+      // ignore the exit code of the script
+      LOG.warn("Caught exception : " + e);
+      status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
+    } catch (IOException e) {
+      LOG.warn("Caught exception : " + e);
+      if (!shexec.isTimedOut()) {
+        status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
+        exceptionStackTrace = org.apache.hadoop.util.StringUtils.stringifyException(e);
+      } else {
+        status = HealthCheckerExitStatus.TIMED_OUT;
+      }
+    } finally {
+      if (status == HealthCheckerExitStatus.SUCCESS) {
+        if (hasErrors(shexec.getOutput())) {
+          status = HealthCheckerExitStatus.FAILED;
+        }
+      }
+    }
+    return new HealthReport(status, getHealthReport(status));
+  }
+
+  private boolean hasErrors(String output) {
+    String[] splits = output.split("\n");
+    for (String split : splits) {
+      if (split.startsWith(ERROR_PATTERN)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private String getHealthReport(HealthCheckerExitStatus status){
+    String healthReport = null;
+    switch (status) {
+    case SUCCESS:
+      healthReport = "Server is healthy.";
+      break;
+    case TIMED_OUT:
+      healthReport = "Health script timed out";
+      break;
+    case FAILED_WITH_EXCEPTION:
+      healthReport = exceptionStackTrace;
+      break;
+    case FAILED_WITH_EXIT_CODE:
+      healthReport = "Health script failed with exit code.";
+      break;
+    case FAILED:
+      healthReport = shexec.getOutput();
+      break;
+    }
+    return healthReport;
+  }
+}

Added: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthReport.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthReport.java?rev=1428142&view=auto
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthReport.java (added)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/HealthReport.java Thu Jan  3 02:12:57 2013
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase;
+
+import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
+
+/**
+ * The Class RegionServerHealthReport containing information about
+ * health of the region server.
+ */
+class HealthReport {
+
+  private HealthCheckerExitStatus status;
+  private String healthReport;
+
+  HealthReport(HealthCheckerExitStatus status, String healthReport) {
+    super();
+    this.status = status;
+    this.healthReport = healthReport;
+  }
+
+  /**
+   * Gets the status of the region server.
+   *
+   * @return HealthCheckerExitStatus
+   */
+  HealthCheckerExitStatus getStatus() {
+    return status;
+  }
+ 
+  /**
+   * Gets the health report of the region server.
+   *
+   * @return String
+   */
+  String getHealthReport() {
+    return healthReport;
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((healthReport == null) ? 0 : healthReport.hashCode());
+    result = prime * result + ((status == null) ? 0 : status.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null) {
+      return false;
+    }
+    if (!(obj instanceof HealthReport)) {
+      return false;
+    }
+    HealthReport other = (HealthReport) obj;
+    if (healthReport == null) {
+      if (other.healthReport != null) {
+        return false;
+      }
+    } else if (!healthReport.equals(other.healthReport)) {
+      return false;
+    }
+    if (status != other.status) {
+      return false;
+    }
+    return true;
+  }
+
+}

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1428142&r1=1428141&r2=1428142&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Thu Jan  3 02:12:57 2013
@@ -55,6 +55,7 @@ import org.apache.hadoop.hbase.HConstant
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HServerLoad;
 import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.HealthCheckChore;
 import org.apache.hadoop.hbase.MasterNotRunningException;
 import org.apache.hadoop.hbase.PleaseHoldException;
 import org.apache.hadoop.hbase.Server;
@@ -235,6 +236,9 @@ Server {
   private Map<String, Class<? extends CoprocessorProtocol>>
       protocolHandlerNames = Maps.newHashMap();
 
+  /** The health check chore. */
+  private HealthCheckChore healthCheckChore;
+
   /**
    * Initializes the HMaster. The steps are as follows:
    * <p>
@@ -303,6 +307,13 @@ Server {
     this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + isa.getPort(), this, true);
     this.rpcServer.startThreads();
     this.metrics = new MasterMetrics(getServerName().toString());
+
+    // Health checker thread.
+    int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
+      HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
+    if (isHealthCheckerConfigured()) {
+      healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
+    }
   }
 
   /**
@@ -878,7 +889,12 @@ Server {
      this.infoServer.setAttribute(MASTER, this);
      this.infoServer.start();
     }
-   
+
+   // Start the health checker
+   if (this.healthCheckChore != null) {
+     Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker");
+   }
+
     // Start allowing requests to happen.
     this.rpcServer.openServer();
     if (LOG.isDebugEnabled()) {
@@ -905,6 +921,9 @@ Server {
       }
     }
     if (this.executorService != null) this.executorService.shutdown();
+    if (this.healthCheckChore != null) {
+      this.healthCheckChore.interrupt();
+    }
   }
 
   private static Thread getAndStartBalancerChore(final HMaster master) {
@@ -1926,4 +1945,9 @@ Server {
   public HFileCleaner getHFileCleaner() {
     return this.hfileCleaner;
   }
+
+  private boolean isHealthCheckerConfigured() {
+    String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
+    return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
+  }
 }

Modified: hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1428142&r1=1428141&r2=1428142&view=diff
==============================================================================
--- hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.94/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Thu Jan  3 02:12:57 2013
@@ -66,6 +66,7 @@ import org.apache.hadoop.hbase.HBaseConf
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
 import org.apache.hadoop.hbase.HDFSBlocksDistribution;
+import org.apache.hadoop.hbase.HealthCheckChore;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HServerAddress;
 import org.apache.hadoop.hbase.HServerInfo;
@@ -368,6 +369,9 @@ public class HRegionServer implements HR
 
   private RegionServerCoprocessorHost rsHost;
 
+  /** The health check chore. */
+  private HealthCheckChore healthCheckChore;
+
   /**
    * Starts a HRegionServer at the default location
    *
@@ -660,6 +664,13 @@ public class HRegionServer implements HR
     this.compactionChecker = new CompactionChecker(this,
       this.threadWakeFrequency * multiplier, this);
 
+    // Health checker thread.
+    int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
+      HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
+    if (isHealthCheckerConfigured()) {
+      healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
+    }
+
     this.leases = new Leases((int) conf.getLong(
         HConstants.HBASE_REGIONSERVER_LEASE_PERIOD_KEY,
         HConstants.DEFAULT_HBASE_REGIONSERVER_LEASE_PERIOD),
@@ -775,6 +786,9 @@ public class HRegionServer implements HR
     if (this.hlogRoller != null) this.hlogRoller.interruptIfNecessary();
     if (this.compactionChecker != null)
       this.compactionChecker.interrupt();
+    if (this.healthCheckChore != null) {
+      this.healthCheckChore.interrupt();
+    }
 
     if (this.killed) {
       // Just skip out w/o closing regions.  Used when testing.
@@ -1559,6 +1573,10 @@ public class HRegionServer implements HR
       handler);
     Threads.setDaemonThreadRunning(this.compactionChecker.getThread(), n +
       ".compactionChecker", handler);
+    if (this.healthCheckChore != null) {
+      Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker",
+        handler);
+    }
 
     // Leases is not a Thread. Internally it runs a daemon thread. If it gets
     // an unhandled exception, it will just exit.
@@ -1789,6 +1807,9 @@ public class HRegionServer implements HR
   protected void join() {
     Threads.shutdown(this.compactionChecker.getThread());
     Threads.shutdown(this.cacheFlusher.getThread());
+    if (this.healthCheckChore != null) {
+      Threads.shutdown(this.healthCheckChore.getThread());
+    }
     if (this.hlogRoller != null) {
       Threads.shutdown(this.hlogRoller.getThread());
     }
@@ -3864,4 +3885,9 @@ public class HRegionServer implements HR
     }
     return 0;
   }
+
+  private boolean isHealthCheckerConfigured() {
+    String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
+    return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
+  }
 }

Added: hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestNodeHealthCheckChore.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestNodeHealthCheckChore.java?rev=1428142&view=auto
==============================================================================
--- hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestNodeHealthCheckChore.java (added)
+++ hbase/branches/0.94/src/test/java/org/apache/hadoop/hbase/TestNodeHealthCheckChore.java Thu Jan  3 02:12:57 2013
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.SmallTests;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
+import org.junit.After;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category(SmallTests.class)
+public class TestNodeHealthCheckChore {
+
+  private static final Log LOG = LogFactory.getLog(TestNodeHealthCheckChore.class);
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+  private File healthScriptFile;
+
+
+  @After
+  public void cleanUp() throws IOException {
+    UTIL.cleanupTestDir();
+  }
+
+  @Test
+  public void testHealthChecker() throws Exception {
+    Configuration config = getConfForNodeHealthScript();
+    config.addResource(healthScriptFile.getName());
+    String location = healthScriptFile.getAbsolutePath();
+    long timeout = config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 100);
+
+    String normalScript = "echo \"I am all fine\"";
+    createScript(normalScript, true);
+    HealthChecker checker = new HealthChecker();
+    checker.init(location, timeout);
+    HealthReport report = checker.checkHealth();
+    assertTrue(report.getStatus() == HealthCheckerExitStatus.SUCCESS);
+    LOG.info("Health Status:" + checker);
+
+    String errorScript = "echo ERROR\n echo \"Node not healthy\"";
+    createScript(errorScript, true);
+    report = checker.checkHealth();
+    assertTrue(report.getStatus() == HealthCheckerExitStatus.FAILED);
+    LOG.info("Health Status:" + report.getHealthReport());
+
+    String timeOutScript = "sleep 4\n echo\"I am fine\"";
+    createScript(timeOutScript, true);
+    report = checker.checkHealth();
+    assertTrue(report.getStatus() == HealthCheckerExitStatus.TIMED_OUT);
+    LOG.info("Health Status:" + report.getHealthReport());
+
+    healthScriptFile.delete();
+  }
+
+  @Test
+  public void testNodeHealthChore() throws Exception{
+    Stoppable stop = new StoppableImplementation();
+    Configuration conf = getConfForNodeHealthScript();
+    String errorScript = "echo ERROR\n echo \"Node not healthy\"";
+    createScript(errorScript, true);
+    HealthCheckChore rsChore = new HealthCheckChore(100, stop, conf);
+    //Default threshold is three.
+    rsChore.chore();
+    rsChore.chore();
+    assertFalse("Stoppable must not be stopped.", stop.isStopped());
+    rsChore.chore();
+    assertTrue("Stoppable must have been stopped.", stop.isStopped());
+  }
+
+  private void createScript(String scriptStr, boolean setExecutable)
+      throws Exception {
+    healthScriptFile.createNewFile();
+    PrintWriter pw = new PrintWriter(new FileOutputStream(healthScriptFile));
+    pw.println(scriptStr);
+    pw.flush();
+    pw.close();
+    healthScriptFile.setExecutable(setExecutable);
+  }
+
+  private Configuration getConfForNodeHealthScript() {
+    Configuration conf = UTIL.getConfiguration();
+    File tempDir = new File(UTIL.getDataTestDir().toString());
+    tempDir.mkdirs();
+    healthScriptFile = new File(tempDir.getAbsolutePath(), "HealthScript.sh");
+    conf.set(HConstants.HEALTH_SCRIPT_LOC,
+      healthScriptFile.getAbsolutePath());
+    conf.setLong(HConstants.HEALTH_FAILURE_THRESHOLD, 3);
+    conf.setLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 100);
+    return conf;
+  }
+
+  /**
+   * Simple helper class that just keeps track of whether or not its stopped.
+   */
+  private static class StoppableImplementation implements Stoppable {
+    private volatile boolean stop = false;
+
+    @Override
+    public void stop(String why) {
+      this.stop = true;
+    }
+
+    @Override
+    public boolean isStopped() {
+      return this.stop;
+    }
+
+  }
+}



Mime
View raw message