hbase-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From li...@apache.org
Subject svn commit: r1480007 - in /hbase/branches/0.89-fb/src: main/java/org/apache/hadoop/hbase/client/ main/java/org/apache/hadoop/hbase/ipc/ main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/util/ test/java/org/apache/hadoop/hbase/...
Date Tue, 07 May 2013 18:32:25 GMT
Author: liyin
Date: Tue May  7 18:32:20 2013
New Revision: 1480007

URL: http://svn.apache.org/r1480007
Log:
[HBASE-8500] Adds a RollingRestart utility for a region server

Author: rshroff

Summary:
This change adds a basic RollingRestart utility that operates at the
region server level. It also adds a basic RegionChecker that monitors the
availability of each region during the rolling restart.

The changes in the master code path provide finer-grained control over
region servers that are blacklisted. They also include a bug fix in
canOffloadTo().

You can now trigger a RollingRestart of the region server on a host by
issuing:
*HBASE/bin/hbase org.apache.hadoop.hbase.util.RollingRestart -c -s hbasedev128.ash3

  // Test

Test Plan:
mr, plus performed a RollingRestart of a region server on the dev
cluster (sample output: https://phabricator.fb.com/P2126823)

Reviewers: aaiyer, liyintang, manukranthk, adela

Reviewed By: aaiyer

CC: hbase-eng@, paultuckfield

Differential Revision: https://phabricator.fb.com/D796086

Task ID: 2229110

Added:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java
Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java?rev=1480007&r1=1480006&r2=1480007&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java
(original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java
Tue May  7 18:32:20 2013
@@ -1014,12 +1014,14 @@ public class HConnectionManager {
           HRegionLocation metaLocation = locateRegion(parentTable, metaKey);
 
           server = metaLocation.getServerAddress();
+          fInfo = repeatedFailuresMap.get(server);
+
           // Handle the case where .META. is on an unresponsive server.
           if (inFastFailMode(server) &&
               !this.currentThreadInFastFailMode()) {
             // In Fast-fail mode, all but one thread will fast fail. Check
             // if we are that one chosen thread.
-            fInfo = repeatedFailuresMap.get(server);
+
             retryDespiteFastFailMode = shouldRetryInspiteOfFastFail(fInfo);
 
             if (retryDespiteFastFailMode == false) { // we don't have to retry
@@ -1633,14 +1635,14 @@ public class HConnectionManager {
         if (instantiateRegionLocation) {
           callable.instantiateRegionLocation(false);
         }
-
         // Logic to fast fail requests to unreachable servers.
         server = callable.getServerAddress();
+        fInfo = repeatedFailuresMap.get(server);
+
         if (inFastFailMode(server) &&
             !currentThreadInFastFailMode()) {
           // In Fast-fail mode, all but one thread will fast fail. Check
           // if we are that one chosen thread.
-          fInfo = repeatedFailuresMap.get(server);
           retryDespiteFastFailMode = shouldRetryInspiteOfFastFail(fInfo);
           if (retryDespiteFastFailMode == false) { // we don't have to retry
             throw new PreemptiveFastFailException(fInfo.numConsecutiveFailures.get(),
@@ -1711,6 +1713,7 @@ public class HConnectionManager {
       if (fInfo == null) {
         fInfo = new FailureInfo(currentTime);
         FailureInfo oldfInfo = repeatedFailuresMap.putIfAbsent(server, fInfo);
+
         if (oldfInfo != null) {
           fInfo = oldfInfo;
         }
@@ -1841,6 +1844,7 @@ public class HConnectionManager {
 
       // If we were able to connect to the server, reset the failure information.
       if (couldNotCommunicate == false) {
+        LOG.info("Clearing out PFFE for server " + server.getHostname());
         repeatedFailuresMap.remove(server);
       } else {
         // update time of last attempt

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java?rev=1480007&r1=1480006&r2=1480007&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
(original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
Tue May  7 18:32:20 2013
@@ -201,7 +201,8 @@ public interface HMasterInterface extend
 
   /**
    * Adds a server to the blacklist map. With this, the Master will not assign
-   * any new regions to this region server
+   * any new regions to this region server. Only an explicit MOVE_REGION
+   * request will move a region to the blacklisted server.
    * @param hostAndPort
    */
   public void addServerToBlacklist(final String hostAndPort);

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1480007&r1=1480006&r2=1480007&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue May
 7 18:32:20 2013
@@ -1885,16 +1885,11 @@ public class HMaster extends HasThread i
             Bytes.toStringBinary(regionname) + " in .META. Move failed");
       }
 
-      // Assign the specified host to be the preferred host for the specified region.
-      if (!this.isServerBlackListed(hostnameAndPort)) {
-        this.regionManager.getAssignmentManager().
-          addTransientAssignment(serverAddress, hri);
-        // Close the region so that it will be re-opened by the preferred host.
-        modifyTable(tableName, HConstants.Modify.CLOSE_REGION, new Writable[]{args[0]});
-      } else {
-        LOG.warn("Cannot move the region " + Bytes.toStringBinary(regionname) +
-            " to blacklisted server " + hostnameAndPort);
-      }
+      this.regionManager.getAssignmentManager().
+        addTransientAssignment(serverAddress, hri);
+      // Close the region so that it will be re-opened by the preferred host.
+      modifyTable(tableName, HConstants.Modify.CLOSE_REGION, new Writable[]{args[0]});
+
       break;
     }
 

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=1480007&r1=1480006&r2=1480007&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
(original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
Tue May  7 18:32:20 2013
@@ -594,7 +594,7 @@ public class RegionManager {
     if (rootState != null && rootState.isUnassigned()) {
       // just make sure it isn't hosting META regions (unless
       // it's the only server left).
-      if (!isMetaServer || isSingleServer) {
+      if ((!isMetaServer || isSingleServer) && !master.isServerBlackListed(server.getHostnamePort()))
{
         regionsToAssign.add(rootState);
         LOG.debug("Going to assign -ROOT- region to server " +
             server.getHostnamePort());
@@ -625,7 +625,7 @@ public class RegionManager {
         }
         // Assign the META region here explicitly
         if (regionInfo.isMetaRegion()) {
-          if (regionState.isUnassigned()) {
+          if (regionState.isUnassigned() && !master.isServerBlackListed(server.getHostnamePort()))
{
             regionsToAssign.clear();
             regionsToAssign.add(regionState);
             LOG.debug("Going to assign META region: " +
@@ -658,7 +658,8 @@ public class RegionManager {
         if (preservedRegionsForCurrentRS == null || 
             !preservedRegionsForCurrentRS.contains(regionInfo)) {
           if (assignmentManager.hasTransientAssignment(regionInfo) || 
-              nonPreferredAssignment > this.maxAssignInOneGo) {
+              nonPreferredAssignment > this.maxAssignInOneGo ||
+              master.isServerBlackListed(server.getHostnamePort())) {
             // Hold the region for its favored nodes and limit the number of 
             // non preferred assignments for each region server.
             continue;
@@ -1947,6 +1948,11 @@ public class RegionManager {
     public void loadBalancing(HServerInfo info, HRegionInfo[] mostLoadedRegions,
         ArrayList<HMsg> returnMsgs) {
 
+      if (master.isServerBlackListed(info.getHostnamePort())) {
+        LOG.debug("Server " + info.getHostnamePort() + " is blacklisted. " +
+            "Cannot load balance the regions for this server. Returning");
+        return;
+      }
       int regionsUnassigned = balanceToPrimary(info, mostLoadedRegions,
           returnMsgs);
 
@@ -2148,7 +2154,7 @@ public class RegionManager {
     private boolean canOffloadTo(HServerAddress hServerAddress) {
 
       // Server is black listed. Cannot move the regions to this server.
-      if (ServerManager.isServerBlackListed(hServerAddress.getHostAddressWithPort())) {
+      if (ServerManager.isServerBlackListed(hServerAddress.getHostNameWithPort())) {
         LOG.info("Blacklisted Server. Cannot offload. Returning...");
         return false;
       }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1480007&r1=1480006&r2=1480007&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
(original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Tue May  7 18:32:20 2013
@@ -661,21 +661,8 @@ public class ServerManager {
       // Should we tell it close regions because its overloaded?  If its
       // currently opening regions, leave it alone till all are open.
       if (openingCount < this.nobalancingCount) {
-
-        if (!blacklistedRSHostPortMap.containsKey(
-            serverInfo.getHostnamePort()) || serversToServerInfo.size() <= 1) {
-
-          // Production code path.
           master.getRegionManager().assignRegions(serverInfo,
               mostLoadedRegions, returnMsgs);
-        } else {
-
-          // We just don't assign anything to "blacklisted" regionservers .
-          // This is OK because another regionserver will get these regions
-          // in response to a heartbeat.
-          LOG.debug("Not assigning regions to blacklisted regionserver "
-              + serverInfo.getHostnamePort());
-        }
       }
 
       // Send any pending table actions.

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java?rev=1480007&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java
(added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java
Tue May  7 18:32:20 2013
@@ -0,0 +1,516 @@
+package org.apache.hadoop.hbase.util;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.MasterNotRunningException;
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.ipc.HRegionInterface;
+import org.apache.hadoop.hbase.master.AssignmentPlan;
+import org.apache.hadoop.hbase.master.RegionPlacement;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
+public class RollingRestart {
+
+  private static final Log LOG = LogFactory.getLog(RollingRestart.class);
+
+  HServerAddress serverAddr;
+  final Configuration conf;
+  AssignmentPlan plan;
+  HRegionInfo[] regions;
+  STAGE currentState;
+  HBaseAdmin admin = null;
+  int sleepIntervalAfterRestart = 0;
+  int regionDrainInterval = 0;
+  int regionUndrainInterval = 0;
+  int getOpFrequency = 0;
+  int sleepIntervalBeforeRestart = 0;
+  int moveTimeoutInterval = 60000;
+  int moveRetries = 1;
+  boolean useHadoopCtl = true;
+  HashMap<HServerAddress, HRegionInterface> serverConnectionMap =
+      new HashMap<HServerAddress, HRegionInterface>();
+  ArrayList<RegionChecker> regionCheckers = new ArrayList<RegionChecker>();
+
+  final static int DEFAULT_SLEEP_AFTER_RESTART_INTERVAL = 10000;
+  final static int DEFAULT_SLEEP_BEFORE_RESTART_INTERVAL = 10000;
+  final static int DEFAULT_REGION_DRAIN_INTERVAL = 1000;
+  final static int DEFAULT_REGION_UNDRAIN_INTERVAL = 10000;
+  final static int DEFAULT_GETOP_FREQUENCY = 1000;
+  final static int DEFAULT_MOVE_RETRIES = 1;
+  final static int DEFAULT_MOVE_TIMEOUT = 60000;
+
+  RollingRestart(String serverName, int regionDrainInterval,
+      int regionUndrainInterval, int sleepIntervalAfterRestart,
+      int sleepIntervalBeforeRestart, int getOpFrequency,
+      boolean useHadoopCtl) throws IOException {
+
+    this.sleepIntervalAfterRestart = sleepIntervalAfterRestart;
+    this.sleepIntervalBeforeRestart = sleepIntervalBeforeRestart;
+    this.useHadoopCtl = useHadoopCtl;
+    this.regionDrainInterval = regionDrainInterval;
+    this.regionUndrainInterval = regionUndrainInterval;
+    this.getOpFrequency = getOpFrequency;
+
+    conf = HBaseConfiguration.create();
+    this.moveRetries = conf.getInt("hbase.rollingrestart.move.maxretries", DEFAULT_MOVE_RETRIES);
+    this.moveTimeoutInterval = conf.getInt("hbase.rollingrestart.move.timeout", DEFAULT_MOVE_TIMEOUT);
+
+    try {
+      admin = new HBaseAdmin(conf);
+    } catch (MasterNotRunningException e) {
+      currentState = STAGE.FAIL;
+      return;
+    }
+
+    this.serverAddr = new HServerAddress(serverName, 60020);
+
+    currentState = STAGE.SETUP;
+  }
+
+  HRegionInterface getHRegionConnection(HServerAddress server) throws IOException {
+    if (serverConnectionMap.get(server) == null) {
+      HRegionInterface rs = admin.getConnection().getHRegionConnection(server);
+      serverConnectionMap.put(server, rs);
+      return rs;
+    }
+    return serverConnectionMap.get(server);
+  }
+
+  enum STAGE {
+    SETUP,
+    DRAIN,
+    RESTART_REGIONSERVER,
+    UNDRAIN,
+    COMPLETE,
+    FAIL
+  };
+
+  boolean moveRegion(final HRegionInfo region) throws Exception {
+    HRegionInterface destinationServer = getDestinationServer(region);
+
+    if (destinationServer == null) {
+      LOG.debug("No preferred server found for " + region.getRegionNameAsString() +
+          ". Skipping...");
+      return false;
+    }
+
+    LOG.info("Moving region:" + region.getRegionNameAsString() + " to " +
+        destinationServer.getHServerInfo().getHostname());
+
+    int numTries = 0;
+    long startTimeInMs = System.currentTimeMillis();
+
+    admin.moveRegion(region.getRegionName(),
+        destinationServer.getHServerInfo().getHostnamePort());
+
+    while (true) {
+      try {
+        HRegionInfo r = destinationServer.getRegionInfo(region.getRegionName());
+        if (r != null) {
+          break;
+        }
+      } catch (Exception e) {
+        if ((System.currentTimeMillis() - startTimeInMs) > moveTimeoutInterval) {
+          if (++numTries >= this.moveRetries) {
+            LOG.warn("Reached max " + numTries + " tries while moving region " +
+                region.getRegionNameAsString() + " to destination server " +
+                destinationServer.getHServerInfo().getHostname());
+            return false;
+          }
+
+          LOG.warn("Timed out while moving region " +
+              region.getRegionNameAsString() + " to destination server " +
+              destinationServer.getHServerInfo().getHostname() + ". Retrying");
+
+          admin.moveRegion(region.getRegionName(),
+              destinationServer.getHServerInfo().getHostnamePort());
+          startTimeInMs = System.currentTimeMillis();
+        }
+        LOG.info("Waiting for region to come online on destination region server");
+      }
+      Thread.sleep(2000);
+    }
+    return true;
+  }
+
+  /**
+   * Restarts the regionserver using the hadoopctl script. This adds
+   * a dependency on the hadoopctl script.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  void restart() throws IOException, InterruptedException {
+    System.out.println("Shutting down the region server");
+    Thread.sleep(this.sleepIntervalBeforeRestart);
+    String cellName = conf.get("titan.cell.name");
+    try {
+
+      if (this.useHadoopCtl) {
+        Process p = Runtime.getRuntime().exec("/usr/local/bin/hadoopctl restart regionserver");
+        p.waitFor();
+      } else {
+        Process p = Runtime.getRuntime().exec("/usr/local/hadoop/" +
+            cellName + "-HBASE/bin/hbase-daemon.sh stop regionserver");
+        p.waitFor();
+        p = Runtime.getRuntime().exec("/usr/local/hadoop/" +
+            cellName + "-HBASE/bin/hbase-daemon.sh start regionserver");
+        p.waitFor();
+        LOG.info("Exit value for the restarter " + p.exitValue());
+      }
+
+    } catch (IOException e1) {
+      System.out.println("Restart of regionserver failed");
+      throw e1;
+    }
+
+    // Wait for it to come back online
+    while(true) {
+      try {
+        if (getHRegionConnection(serverAddr).isStopped() == false) {
+          break;
+        }
+     } catch (Exception e) {
+       System.out.println("Waiting for region server to come online.");
+       Thread.sleep(1000);
+     }
+    }
+    Thread.sleep(this.sleepIntervalAfterRestart);
+  }
+
+  final HRegionInterface getDestinationServer(final HRegionInfo region) throws IOException
{
+
+    // We are undraining, return the same regionserver back
+    if (currentState == STAGE.UNDRAIN) {
+      return getHRegionConnection(serverAddr);
+    }
+
+    List<HServerAddress> serversForRegion = plan.getAssignment(region);
+
+    // Get the preferred region server from the Assignment Plan
+    for (HServerAddress server : serversForRegion) {
+      if (!server.equals(serverAddr)) {
+        try {
+          HRegionInterface candidate = getHRegionConnection(server);
+          if (!candidate.isStopped()) {
+            return candidate;
+          }
+        } catch (IOException e) {
+          // server not online/reachable skip
+        }
+      }
+    }
+
+    // if none found we should return a random server. For now return null
+    return null;
+  }
+
+  void drainServer() throws Exception {
+
+    LOG.info("Draining region server");
+
+    currentState = STAGE.DRAIN;
+    for (HRegionInfo region : regions) {
+      if (region.isMetaRegion() ||
+          region.isRootRegion()  ||
+          region.getRegionNameAsString().contains(",,")) {
+        continue;
+      }
+      moveRegion(region);
+      Thread.sleep(this.regionDrainInterval);
+    }
+  }
+
+  void undrainServer() throws Exception {
+    LOG.info("Undraining region server");
+    currentState = STAGE.UNDRAIN;
+    for (HRegionInfo region : regions) {
+      if (region.isMetaRegion() ||
+          region.isRootRegion() ||
+          region.getRegionNameAsString().contains(",,")) {
+        continue;
+      }
+      moveRegion(region);
+      Thread.sleep(this.regionUndrainInterval);
+    }
+  }
+
+  void setup() throws IOException {
+
+    LOG.info("Setup started");
+    // blacklist the server
+    admin.getMaster().addServerToBlacklist(
+        getHRegionConnection(serverAddr).getHServerInfo().getHostnamePort());
+
+    regions = getHRegionConnection(serverAddr).getRegionsAssignment();
+
+    RegionPlacement regionPlacementProxy = new RegionPlacement(conf);
+    plan = regionPlacementProxy.getExistingAssignmentPlan();
+
+    // Start the region checker for all the regions present on the region server
+    for (HRegionInfo region : regions) {
+      RegionChecker checker =
+          new RegionChecker(region, region.getTableDesc().getNameAsString(), conf, this.getOpFrequency);
+      this.regionCheckers.add(checker);
+      checker.start();
+    }
+
+    LOG.info("Setup Complete");
+  }
+
+  public void clear() {
+
+    for (RegionChecker r : this.regionCheckers) {
+      r.stop();
+      r.printInfo();
+    }
+    this.regionCheckers.clear();
+
+    try {
+      admin.getMaster().clearBlacklistedServer(
+          getHRegionConnection(serverAddr).getHServerInfo().getHostnamePort());
+    } catch (IOException e) {
+      LOG.error("Failed to remove the server from black list. Please remove it");
+    }
+   }
+
+   public static void clearAll() {
+     Configuration conf = HBaseConfiguration.create();
+
+     try {
+       HBaseAdmin admin = new HBaseAdmin(conf);
+       try {
+         admin.getMaster().clearAllBlacklistedServers();
+       } catch (IOException e) {
+         LOG.error("Failed to clear black listed regionservers.");
+       }
+     } catch (MasterNotRunningException e) {
+       LOG.error("Cannot initialize admin. Error: " + e.getMessage());
+     }
+   }
+
+   public class RegionChecker implements Runnable {
+     final HRegionInfo regionInfo;
+     final byte[] startKey, endKey;
+     final String tableName;
+     final Configuration conf;
+     int frequency;
+     Map<Long, Exception> errors = new HashMap<Long, Exception>();
+     long lastTimeExceptionSeen = 0;
+     long totalTimeout = 0;
+     ScheduledExecutorService threadPool = Executors.newScheduledThreadPool(1);
+     final Random rand = new Random ();
+     HTable table = null;
+
+     RegionChecker(final HRegionInfo info, final String tableName,
+           final Configuration conf, int frequency) {
+       this.regionInfo = info;
+       this.tableName = tableName;
+       this.conf = conf;
+
+       this.frequency = frequency;
+       this.startKey = info.getStartKey();
+       this.endKey = info.getEndKey();
+       try {
+         table = new HTable(conf, tableName);
+       } catch (IOException e) {
+         e.printStackTrace();
+         return;
+       }
+     }
+
+     public void start() {
+      threadPool.scheduleAtFixedRate(this, frequency, frequency, TimeUnit.MILLISECONDS);
+     }
+
+     public void run() {
+      long currentTime = 0;
+      Get g = new Get(getOneRandomRow());
+      try {
+         currentTime = System.currentTimeMillis();
+
+         table.get(g);
+
+         if (lastTimeExceptionSeen != 0) {
+           LOG.debug("Retry successful for region " + this.regionInfo.getRegionNameAsString());
+           totalTimeout += (System.currentTimeMillis() - lastTimeExceptionSeen);
+           lastTimeExceptionSeen = 0;
+         }
+       } catch (Exception e) {
+         errors.put(currentTime, e);
+
+         LOG.debug(regionInfo.getRegionNameAsString() +
+             " encountered exception. Row: " + Bytes.toStringBinary(g.getRow()) + " Count
= " + errors.size(), e);
+         if (lastTimeExceptionSeen == 0) {
+           lastTimeExceptionSeen = System.currentTimeMillis();
+         }
+       }
+     }
+
+     public void stop() {
+       threadPool.shutdownNow();
+       if (lastTimeExceptionSeen != 0) {
+         totalTimeout += (System.currentTimeMillis() - lastTimeExceptionSeen);
+       }
+     }
+
+     public byte[] getOneRandomRow () {
+
+       byte[][] randomSplits = Bytes.split(startKey, endKey, true,
+             rand.nextInt(16));
+       return randomSplits[0];
+     }
+
+     public void printInfo() {
+       LOG.info(regionInfo.getRegionNameAsString() +
+           ": total timeout = " + totalTimeout + ", number of errors = " +  errors.size());
+     }
+   };
+
+  /**
+   * @param args
+   * @throws ParseException
+   */
+  public static void main(String[] args) throws ParseException {
+
+    Options options = new Options();
+
+    options.addOption("s", "server", true,
+        "Name of the region server to restart");
+    options.addOption("r", "sleep_after_restart", true,
+        "time interval after which the region server should be started assigning regions.
Default : 10000ms");
+    options.addOption("r", "sleep_before_restart", true,
+        "time interval after which the region server should be restarted after draining.
Default : 10000ms");
+    options.addOption("d", "region_drain_interval", true,
+        "time interval between region movements while draining. Default : 1000ms");
+    options.addOption("u", "region_undrain_interval", true,
+        "time interval between region movements while undraining. Default : 10000ms");
+    options.addOption("g", "get_request_frequency", true,
+        "frequency at which region checker will check for region availability. Default :
1000ms");
+    options.addOption("c", "clear", false,
+        "Clear all the regionserver from blacklist. Default : false");
+    options.addOption("h", "dont_use_hadoopctl", false,
+        "Don't hadoopctl to restart the regionserver. Default : true");
+
+    if (args.length == 0) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("RollingRestart", options, true);
+      return;
+    }
+
+    CommandLineParser parser = new PosixParser();
+    CommandLine cmd = parser.parse(options, args);
+
+    String serverName = null;
+    int sleepIntervalAfterRestart = RollingRestart.DEFAULT_SLEEP_AFTER_RESTART_INTERVAL;
+    int regionDrainInterval = RollingRestart.DEFAULT_REGION_DRAIN_INTERVAL;
+    int regionUndrainInterval = RollingRestart.DEFAULT_REGION_UNDRAIN_INTERVAL;
+    int getOpFrequency = RollingRestart.DEFAULT_GETOP_FREQUENCY;
+    int sleepIntervalBeforeRestart = RollingRestart.DEFAULT_SLEEP_BEFORE_RESTART_INTERVAL;
+    boolean useHadoopCtl = true;
+
+    if (cmd.hasOption("c")) {
+      RollingRestart.clearAll();
+    }
+
+    if (!cmd.hasOption("s")) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("RollingRestart", options, true);
+      return;
+    } else {
+      serverName = cmd.getOptionValue("s");
+    }
+
+    if (cmd.hasOption("r")) {
+      sleepIntervalAfterRestart = Integer.parseInt(cmd.getOptionValue("r"));
+    }
+
+    if (cmd.hasOption("b")) {
+      sleepIntervalBeforeRestart = Integer.parseInt(cmd.getOptionValue("b"));
+    }
+
+    if (cmd.hasOption("h")) {
+      useHadoopCtl = false;
+    }
+
+    if (cmd.hasOption("d")) {
+      regionDrainInterval = Integer.parseInt(cmd.getOptionValue("d"));
+    }
+
+    if (cmd.hasOption("u")) {
+      regionUndrainInterval = Integer.parseInt(cmd.getOptionValue("u"));
+    }
+
+    if (cmd.hasOption("g")) {
+      getOpFrequency = Integer.parseInt(cmd.getOptionValue("g"));
+    }
+
+    RollingRestart rr = null;
+    try {
+      rr = new RollingRestart(serverName, regionDrainInterval,
+          regionUndrainInterval, sleepIntervalAfterRestart,
+          sleepIntervalBeforeRestart, getOpFrequency, useHadoopCtl);
+    } catch (IOException e) {
+      e.printStackTrace();
+      LOG.error("Rolling restart failed for " + serverName);
+      return;
+    }
+
+    Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR);
+    Logger.getLogger("org.apache.hadoop.hbase").setLevel(Level.INFO);
+
+    try  {
+      rr.setup();
+      rr.drainServer();
+      rr.restart();
+      rr.undrainServer();
+      LOG.info("Rolling restart complete for " + serverName);
+    } catch (Exception e) {
+      e.printStackTrace();
+      LOG.error("Rolling restart failed for " + serverName + " at stage " + rr.currentState.name());
+      switch (rr.currentState) {
+        case SETUP:
+          LOG.error("Cannot start rolling restart. Please retry");
+          break;
+        case DRAIN:
+          LOG.error("Cannot drain regions from the server. It should " +
+              "get reassigned by the Assignment Load Balancer. Need to " +
+              "retry rolling restart.");
+          break;
+        case RESTART_REGIONSERVER:
+          LOG.error("Unable to restart regionserver. Please restart it "
+              + "manually.");
+          break;
+        case UNDRAIN:
+          LOG.error("Unable to move the region back to the regionserver. " +
+              " Assignment Load Balancer will rebalance the regions.");
+         default:
+       }
+    } finally {
+      rr.clear();
+    }
+  }
+}

Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java?rev=1480007&r1=1480006&r2=1480007&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java (original)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java Tue May
 7 18:32:20 2013
@@ -33,6 +33,7 @@ import org.apache.hadoop.hbase.HBaseTest
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HRegionLocation;
+import org.apache.hadoop.hbase.HServerAddress;
 import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
 import org.apache.hadoop.hbase.ipc.HRegionInterface;
 import org.apache.hadoop.hbase.master.AssignmentPlan;
@@ -121,7 +122,7 @@ public class TestHCM {
 
     LOG.debug(blacklistedServer.getServerInfo().getHostnamePort() + " blacklisted");
 
-    drainRegionServer(blacklistedServer);
+    drainRegionServer(ap, blacklistedServer);
 
     LOG.debug("No more regions on black listed server " +
         blacklistedServer.getHServerInfo().getHostnamePort());
@@ -130,7 +131,17 @@ public class TestHCM {
         (blacklistedServerId + 1) % servers.size());
 
     Thread.sleep(60000);
-    assertTrue(blacklistedServer.getOnlineRegions().size() == 0);
+
+    int numberOfNonMetaRegions = 0;
+    for (HRegion r : blacklistedServer.getOnlineRegions()) {
+      LOG.debug("Region opened on " + r.getRegionNameAsString());
+      if (!r.getRegionInfo().isMetaRegion() &&
+          !r.getRegionInfo().isRootRegion()) {
+        numberOfNonMetaRegions++;
+      }
+    }
+
+    assertTrue(numberOfNonMetaRegions == 0);
 
     LOG.debug("Removing blacklisted Region Server");
 
@@ -294,7 +305,8 @@ public class TestHCM {
   }
 
   
-  private int drainRegionServer(HRegionServer blacklistedServer) throws IOException, InterruptedException
{
+  private int drainRegionServer(AssignmentPlan ap,
+      HRegionServer blacklistedServer) throws IOException, InterruptedException {
 
     while (true) {
       Collection<HRegion> regions = blacklistedServer.getOnlineRegions();
@@ -309,13 +321,19 @@ public class TestHCM {
         break;
       }
 
-      HRegionServer destRS = TEST_UTIL.getHBaseCluster().getRegionServer(3);
       for (HRegion region : regions) {
+        HRegionInterface destRS =
+            getDestinationServer(ap, blacklistedServer.getHServerInfo().getServerAddress(),
+            region.getRegionInfo());
+        if (destRS == null) {
+          LOG.debug("No preferred server found for " + region.getRegionNameAsString() +
+              ". Skipping");
+        }
         LOG.debug("Moving region " + region.getRegionNameAsString());
         try {
           TEST_UTIL.getHBaseAdmin().moveRegion(
               region.getRegionInfo().getRegionName(),
-              destRS.getServerInfo().getHostnamePort());
+              destRS.getHServerInfo().getHostnamePort());
         } catch (IOException e) {
           LOG.info("Cannot move " + region.getRegionNameAsString());
           continue;
@@ -340,6 +358,30 @@ public class TestHCM {
     return 0;
   }
 
+ final HRegionInterface getDestinationServer(
+     AssignmentPlan plan, HServerAddress serverAddr,
+     final HRegionInfo region) {
+
+    List<HServerAddress> serversForRegion = plan.getAssignment(region);
+
+    // Get the preferred region server from the Assignment Plan
+    for (HServerAddress server : serversForRegion) {
+      if (!server.equals(serverAddr)) {
+        try {
+          HRegionInterface candidate = TEST_UTIL.getHBaseAdmin().getConnection().getHRegionConnection(server);
+          if (!TEST_UTIL.getHBaseAdmin().getConnection().getHRegionConnection(server).isStopped())
{
+            return candidate;
+          }
+        } catch (IOException e) {
+          // server not online/reachable skip
+        }
+      }
+    }
+
+    // if none found we should return a random server. For now return null
+    return null;
+  }
+
   /**
    * Simulates a case where the RegionServer throws exception because
    * a put operation failed.



Mime
View raw message