hbase-commits mailing list archives

From: li...@apache.org
Subject: svn commit: r1509360 - in /hbase/branches/0.89-fb/src: main/java/org/apache/hadoop/hbase/ main/java/org/apache/hadoop/hbase/master/ main/java/org/apache/hadoop/hbase/regionserver/ test/java/org/apache/hadoop/hbase/master/
Date: Thu, 01 Aug 2013 18:18:18 GMT
Author: liyin
Date: Thu Aug  1 18:18:18 2013
New Revision: 1509360

URL: http://svn.apache.org/r1509360
Log:
[HBASE-9104] Added RegionChecker to track when regions become unavailable and when they are opened again

Author: ibra

Summary: Added RegionChecker to track when regions become unavailable and when they are opened again

Test Plan:
1) Stop/restart a regionserver; if it was hosting regions you will see changes in the log (if not,
try stopping/starting/restarting other regionservers).
2) To see the changes in the log, grep for them: grep 'REGION_CHECKER_INFO' logfile

Reviewers: rshroff, aaiyer

Reviewed By: rshroff

CC: hbase-eng@

Differential Revision: https://phabricator.fb.com/D847584

Task ID: 2130094

Added:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionChecker.java
    hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestRegionChecker.java
Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
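
The checker is off by default. Below is a minimal, hypothetical sketch (not part of this commit) of
enabling it programmatically, assuming only the Hadoop Configuration API plus the constants added to
HConstants in this diff; in practice the knob would normally be set in hbase-site.xml:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HConstants;

    public class EnableRegionChecker {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // REGION_CHECKER_ENABLED maps to "hbase.master.regionchecker.enabled" (default: false)
        conf.setBoolean(HConstants.REGION_CHECKER_ENABLED, true);
        System.out.println(HConstants.REGION_CHECKER_ENABLED + " = "
            + conf.getBoolean(HConstants.REGION_CHECKER_ENABLED,
                HConstants.DEFAULT_REGION_CHECKER_ENABLED));
      }
    }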

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java?rev=1509360&r1=1509359&r2=1509360&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/HConstants.java Thu Aug  1 18:18:18 2013
@@ -214,6 +214,11 @@ public final class HConstants {
   /** Default region server interface class name. */
   public static final String DEFAULT_REGION_SERVER_CLASS = HRegionInterface.class.getName();
 
+  /** Parameter name for enabling regionChecker */
+  public static final String REGION_CHECKER_ENABLED = "hbase.master.regionchecker.enabled";
+  /** Default value for enabling regionChecker */
+  public static final Boolean DEFAULT_REGION_CHECKER_ENABLED = false;
+
   /** Parameter name for what compaction manager to use. */
   public static final String COMPACTION_MANAGER_CLASS = "hbase.compactionmanager.class";
 
@@ -334,7 +339,7 @@ public final class HConstants {
 
   /** Default maximum file size */
   public static final long DEFAULT_MAX_FILE_SIZE = 256 * 1024 * 1024;
-  
+
   /** Default minimum number of files to be compacted */
   public static final int DEFAULT_MIN_FILES_TO_COMPACT = 3;
 
@@ -715,7 +720,7 @@ public final class HConstants {
   public static final String HBASE_REGION_ASSIGNMENT_LOADBALANCER_WAITTIME_MS
                                 = "hbase.master.assignment.load.balancer.waittime.ms";
   public static final int DEFAULT_HBASE_REGION_ASSIGNMENT_LOADBALANCER_WAITTIME_MS = 60000;
-  
+
   /*
    * This defines the number of buckets used for computing the histogram of
    * pread latency.

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java?rev=1509360&r1=1509359&r2=1509360&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/BaseScanner.java Thu Aug  1 18:18:18 2013
@@ -211,8 +211,10 @@ abstract class BaseScanner extends Chore
           splitParents.put(region, values);
         }
         rows += 1;
-
       }
+
+      this.master.getRegionManager().addRegionsInfo(rows, this.rootRegion);
+
       if (rootRegion) {
         this.master.getRegionManager().setNumMetaRegions(rows);
       }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java?rev=1509360&r1=1509359&r2=1509360&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessRegionOpen.java Thu Aug  1 18:18:18 2013
@@ -110,6 +110,7 @@ public class ProcessRegionOpen extends P
     LOG.info("Updated row " + regionInfo.getRegionNameAsString() + " in region "
         + Bytes.toString(region.getRegionName()) + " with startcode=" + serverInfo.getStartCode()
         + ", server=" + serverInfo.getHostnamePort());
+    this.master.getServerManager().getRegionChecker().becameOpened(regionInfo);
   }
   
   @Override

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java?rev=1509360&r1=1509359&r2=1509360&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java Thu Aug  1 18:18:18 2013
@@ -53,6 +53,7 @@ class ProcessServerShutdown extends Regi
   private List<MetaRegion> metaRegions, metaRegionsUnassigned;
   private boolean rootRescanned;
   private HServerAddress deadServerAddress;
+  private final long expiredSince;
 
   public enum LogSplitResult {
     NOT_RUNNING,
@@ -78,7 +79,7 @@ class ProcessServerShutdown extends Regi
    * @param master
    * @param serverInfo
    */
-  public ProcessServerShutdown(HMaster master, HServerInfo serverInfo) {
+  public ProcessServerShutdown(HMaster master, HServerInfo serverInfo, long expiredSince) {
     super(master, serverInfo.getServerName());
     this.deadServer = serverInfo.getServerName();
     this.deadServerAddress = serverInfo.getServerAddress();
@@ -86,6 +87,8 @@ class ProcessServerShutdown extends Regi
     this.successfulMetaScans = new HashSet<String>();
     // check to see if I am responsible for either ROOT or any of the META tables.
 
+    this.expiredSince = expiredSince;
+
     // TODO Why do we do this now instead of at processing time?
     closeMetaRegions();
   }
@@ -286,12 +289,12 @@ class ProcessServerShutdown extends Regi
         if (skip)
           continue;
 
-        master.getRegionManager().setUnassigned(info, false);
+        this.setRegionUnassigned(info, false);
         this.metaRegionsUnassigned.add(mr);
       }
       else {
         LOG.debug(this.toString() + "setting " + " unassigned: " + info.toString());
-        master.getRegionManager().setUnassigned(info, false);
+        this.setRegionUnassigned(info, false);
       }
     }
     t2 = System.currentTimeMillis();
@@ -418,7 +421,7 @@ class ProcessServerShutdown extends Regi
       if (metaRegionsUnassigned.contains(metaRegion)) continue;
 
       LOG.info(this.toString() + " setting to unassigned: " + metaRegion.toString());
-      master.getRegionManager().setUnassigned(metaRegion.getRegionInfo(), true);
+      this.setRegionUnassigned(metaRegion.getRegionInfo(), true);
       metaRegionsUnassigned.add(metaRegion);
     }
 
@@ -504,4 +507,9 @@ class ProcessServerShutdown extends Regi
   public LogSplitResult getLogSplitResult() {
     return this.logSplitResult;
   }
+
+  private void setRegionUnassigned(HRegionInfo info, boolean force) {
+    this.master.getServerManager().getRegionChecker().becameClosed(info, this.expiredSince);
+    this.master.getRegionManager().setUnassigned(info, force);
+  }
 }

Added: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionChecker.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionChecker.java?rev=1509360&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionChecker.java (added)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionChecker.java Thu Aug  1 18:18:18 2013
@@ -0,0 +1,485 @@
+/**
+ * Copyright 2010 The Apache Software Foundation Licensed to the Apache Software Foundation (ASF)
+ * under one or more contributor license agreements. See the NOTICE file distributed with this work
+ * for additional information regarding copyright ownership. The ASF licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in
+ * writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
+ * language governing permissions and limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringTokenizer;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
+import org.apache.zookeeper.KeeperException;
+
+/**
+ * Class to track information about region availability and to calculate day and week availability.
+ *
+ * Only information for the latest MAX_LOG_TIME_DIF milliseconds is stored.
+ * It stores information for each region in zookeeper in the following format:
+ * /hbase/regionchecker/data/ - directory where data about every region is stored (info about a region
+ *                              is stored in a file with the region hash as file name)
+ * /hbase/regionchecker/previous/ - directory where info about the region's latest failure is stored,
+ *                              or 0 if the region is assigned (info about a region is stored in a file
+ *                              with the region hash as file name)
+ *
+ * Data is stored in the following format:
+ * "ms duration\n"
+ * ms - start time in milliseconds (from EnvironmentEdgeManager.currentTimeMillis())
+ * duration - duration of being unavailable, in milliseconds.
+ *
+ * @author ibra
+ */
+public class RegionChecker {
+  protected static final Log LOG = LogFactory.getLog(RegionChecker.class);
+
+  private ZooKeeperWrapper zkWrapper;
+  private HMaster master;
+
+  private static final long WEEK_TIME_DIF = 7L * 24L * 60L * 60L * 1000L;
+  private static final long DAY_TIME_DIF = 24L * 60L * 60L * 1000L;
+  private static final long MAX_LOG_TIME_DIF = 7L * 24L * 60L * 60L * 1000L;
+  private final String ZNodeName = "regionchecker";
+
+  /**
+   * Whether the region checker is enabled (read from configuration)
+   */
+  private final boolean regionCheckerEnabled;
+
+  public boolean isEnabled()
+  {
+    return regionCheckerEnabled;
+  }
+
+  public RegionChecker(final HMaster master) {
+    Configuration conf = master.getConfiguration();
+    this.regionCheckerEnabled = conf.getBoolean(HConstants.REGION_CHECKER_ENABLED, HConstants.DEFAULT_REGION_CHECKER_ENABLED);
+    this.master = master;
+    if (this.regionCheckerEnabled) {
+      this.zkWrapper = ZooKeeperWrapper.getInstance(conf, master.getZKWrapperName());
+    }
+  }
+
+  /**
+   * Called when a region becomes closed, to record the region's latest failure.
+   * @param rInfo the region that became closed
+   */
+  synchronized public void becameClosed(final HRegionInfo rInfo) {
+    this.becameClosed(rInfo, EnvironmentEdgeManager.currentTimeMillis());
+  }
+
+  /**
+   * Called when a region becomes closed, to record the region's latest failure.
+   * @param rInfo the region that became closed
+   * @param createdTime the time (in ms) at which the region became closed
+   */
+  synchronized public void becameClosed(final HRegionInfo rInfo, long createdTime) {
+    if (!this.regionCheckerEnabled) {
+      return;
+    }
+
+    final String region = rInfo.getRegionNameAsString();
+    String previous = this.getPrevious(region);
+    if (previous.equals("") || previous.equals("0")) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("region '" + region + "':\t becameClosed");
+      }
+
+      this.setPrevious(region, Long.toString(createdTime));
+    } else {
+      LOG.warn("region '" + region + "' becameClosed called again: previous is already set to '"
+          + previous + "' and now is '" + Long.toString(createdTime) + "'");
+    }
+  }
+
+  /**
+   * Called when a region becomes opened, to record the region's unassignment interval.
+   * @param rInfo the region that became opened
+   */
+  synchronized public void becameOpened(final HRegionInfo rInfo) {
+    if (!this.regionCheckerEnabled) {
+      return;
+    }
+
+    final String region = rInfo.getRegionNameAsString();
+    String previous = this.getPrevious(region);
+    if (!previous.equals("") && !previous.equals("0")) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("region '" + region + "':\t becameOpened");
+      }
+
+      this.setPrevious(region, "0");
+      long currTime = EnvironmentEdgeManager.currentTimeMillis();
+      long prevTime = Long.parseLong(previous);
+
+      String info = prevTime + " " + (currTime - prevTime) + "\n";
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("region '" + region + "':\t" + info);
+      }
+
+      String znodeData = this.getData(region);
+      String strData = "";
+      StringTokenizer in = new StringTokenizer(znodeData);
+      while (in.hasMoreTokens()) {
+        long ms = Long.parseLong(in.nextToken());
+        long duration = Long.parseLong(in.nextToken());
+
+        // we store only information of the latest MAX_LOG_TIME_DIF milliseconds
+        if (currTime - ms <= MAX_LOG_TIME_DIF) {
+          strData = znodeData.substring(znodeData.indexOf(ms + " " + duration));
+          break;
+        }
+      }
+
+      this.setData(region, strData + info);
+    } else {
+      LOG.warn("becameOpened was called for region '" + region + "' without a preceding becameClosed call");
+    }
+  }
+
+  /**
+   * Prints to the log: the cluster's last-day and last-week availability, and each region's
+   * last-day and last-week availability.
+   */
+  private void printAvailabilityInfoToLog() {
+    if (!LOG.isDebugEnabled()) {
+      return;
+    }
+    LOG.debug("getLastDayAvailability =\t" + this.getLastDayAvailability());
+    LOG.debug("getLastWeekAvailability =\t" + this.getLastWeekAvailability());
+
+    Map<String, RegionAvailabilityInfo> detailedDayInfo = this.getDetailedLastDayAvailability();
+    LOG.debug("detailedDayInfo:\n");
+    for (String key : detailedDayInfo.keySet()) {
+      LOG.debug("\t" + "[" + key + "]" + " - " + detailedDayInfo.get(key));
+    }
+
+    Map<String, RegionAvailabilityInfo> detailedWeekInfo = this.getDetailedLastWeekAvailability();
+    LOG.debug("detailedWeekInfo:\n");
+    for (String key : detailedWeekInfo.keySet()) {
+      LOG.debug("\t" + "[" + key + "]" + " - " + detailedWeekInfo.get(key));
+    }
+  }
+
+  /**
+   * @return double - cluster's availability for last day
+   */
+  public double getLastDayAvailability() {
+    return this.getAvailability(DAY_TIME_DIF);
+  }
+
+  /**
+   * @return double - cluster's availability for last week
+   */
+  public double getLastWeekAvailability() {
+    return this.getAvailability(WEEK_TIME_DIF);
+  }
+
+  /**
+   * @return Map<String, RegionAvailabilityInfo> - each pair is <region, its availabilityInfo for last day>
+   */
+  public Map<String, RegionAvailabilityInfo> getDetailedLastDayAvailability() {
+    return this.getDetailedAvailability(DAY_TIME_DIF);
+  }
+
+  /**
+   * @return Map<String, RegionAvailabilityInfo> - each pair is <region, its availabilityInfo for last week>
+   */
+  public Map<String, RegionAvailabilityInfo> getDetailedLastWeekAvailability() {
+    return this.getDetailedAvailability(WEEK_TIME_DIF);
+  }
+
+  /**
+   * @param timeDif - method uses information of last timeDif milliseconds to calculate availability
+   * @return cluster's availability of last timeDif milliseconds
+   */
+  private double getAvailability(long timeDif) {
+    if (!this.regionCheckerEnabled) {
+      return -1.0;
+    }
+
+    Map<String, RegionAvailabilityInfo> detailed = getDetailedAvailability(timeDif);
+
+    double res = 0.0;
+    for(RegionAvailabilityInfo info : detailed.values()) {
+      res += info.getAvailability();
+    }
+
+    res += this.master.getRegionManager().getRegionsCount() - detailed.size();
+    res /= this.master.getRegionManager().getRegionsCount();
+
+    return res;
+  }
+
+  /**
+   * @param timeDif - method uses information of last timeDif milliseconds to calculate availability
+   * @return each region's availability in Map<String, RegionAvailabilityInfo> - each pair is <region, its
+   *         availabilityInfo for last timeDif milliseconds>
+   */
+  private Map<String, RegionAvailabilityInfo> getDetailedAvailability(long timeDif) {
+    if (!this.regionCheckerEnabled) {
+      return new HashMap<String, RegionAvailabilityInfo>();
+    }
+    long curTime = EnvironmentEdgeManager.currentTimeMillis();
+
+    Map<String, RegionAvailabilityInfo> availabilityMap = new HashMap<String, RegionAvailabilityInfo>();
+
+    Iterable<String> dataNodes = this.zkWrapper.listZnodes(
+        this.joinPath(this.zkWrapper.parentZNode, this.ZNodeName, "data"));
+    if (dataNodes != null) {
+      for (String node : dataNodes) {
+        availabilityMap.put(node,
+            new RegionAvailabilityInfo().addAvailabilityInfoFromData(this.getData(node), curTime, timeDif));
+      }
+    }
+
+    Iterable<String> prevNodes = this.zkWrapper.listZnodes(
+        this.joinPath(this.zkWrapper.parentZNode, this.ZNodeName, "previous"));
+    if (prevNodes != null) {
+      for (String node : prevNodes) {
+        String previous = this.getPrevious(node);
+        // check if the file exists and this region is currently unassigned
+        if (!previous.equals("") && !previous.equals("0")) {
+          if(!availabilityMap.containsKey(node)) {
+            availabilityMap.put(node, new RegionAvailabilityInfo());
+          }
+          availabilityMap.put(node,
+              availabilityMap.get(node).addDuration(Long.parseLong(previous), timeDif));
+        }
+      }
+    }
+
+    return availabilityMap;
+  }
+
+  public class RegionAvailabilityInfo
+  {
+    private long intervalsCount;
+    private long lastIntervalStart;
+    private long lastIntervalEnd;
+    private boolean isCurrentlyAssigned;
+    private double availability;
+
+    private RegionAvailabilityInfo()
+    {
+      isCurrentlyAssigned = true;
+      availability = 1.0;
+      intervalsCount = 0;
+    }
+
+    /**
+     * adds unavailability duration (interval [from-NOW]) to AvailabilityInfo
+     * @param from - unavailability interval start
+     * @param timeDif - time through which availability is calculated
+     * @return this
+     */
+    private RegionAvailabilityInfo addDuration(long from, long timeDif)
+    {
+      isCurrentlyAssigned = false;
+      return addDuration(from, EnvironmentEdgeManager.currentTimeMillis(), timeDif);
+    }
+
+    /**
+     * adds unavailability duration (interval [from-to]) to AvailabilityInfo
+     * @param from - unavailability interval start
+     * @param to - unavailability interval end
+     * @param timeDif - time through which availability is calculated
+     * @return this
+     */
+    private RegionAvailabilityInfo addDuration(long from, long to, long timeDif)
+    {
+      availability -= (double) (to-from) / timeDif;
+      lastIntervalStart = from;
+      lastIntervalEnd = to;
+      intervalsCount++;
+
+      return this;
+    }
+
+    /**
+     * @return last unavailability interval start time
+     */
+    public long getLastIntervalStart()
+    {
+      return lastIntervalStart;
+    }
+
+    /**
+     * @return last unavailability interval end time
+     */
+    public long getLastIntervalEnd()
+    {
+      return lastIntervalEnd;
+    }
+
+    /**
+     * @return availability
+     */
+    public double getAvailability()
+    {
+      return availability;
+    }
+
+    /**
+     * @return whether the region is currently assigned (i.e. the last unavailability interval has ended)
+     */
+    public boolean isCurrentlyAssigned()
+    {
+      return isCurrentlyAssigned;
+    }
+
+    /**
+     * @return duration of last unavailability interval
+     */
+    public long getDuration()
+    {
+      return lastIntervalEnd-lastIntervalStart;
+    }
+
+    /**
+     * @return number of unavailability intervals
+     */
+    public long getIntervalsCount()
+    {
+      return intervalsCount;
+    }
+
+    public String getInterval()
+    {
+      if(intervalsCount == 0)
+        return "";
+      SimpleDateFormat dateFormat = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss SSS");
+      return dateFormat.format(new Date(lastIntervalStart)) + "  ->  "
+          + (isCurrentlyAssigned() ? dateFormat.format(new Date(lastIntervalEnd)) : "NOW");
+    }
+
+    /**
+     * Adds the availability info stored in znodeData to this object.
+     * @param znodeData - string that contains data
+     * @param curTime - current time in ms
+     * @param timeDif - time through which availability is calculated
+     * @return this
+     */
+    private RegionAvailabilityInfo addAvailabilityInfoFromData(String znodeData, long curTime,
+        long timeDif) {
+      StringTokenizer in = new StringTokenizer(znodeData);
+      while (in.hasMoreTokens()) {
+        long ms = Long.parseLong(in.nextToken());
+        long duration = Long.parseLong(in.nextToken());
+        if (curTime - ms <= timeDif) {
+          addDuration(ms, ms+duration, timeDif);
+        }
+      }
+      return this;
+    }
+  }
+
+  /**
+   * @param region
+   * @return data stored in this region's previous-file or "" if there is no such file
+   */
+  private String getPrevious(String region) {
+    return this.getZnode("previous", region);
+  }
+
+  /**
+   * Sets strData as the contents of this region's previous-file.
+   * @param region
+   * @param strData
+   */
+  private void setPrevious(String region, String strData) {
+    this.setZnode("previous", region, strData);
+  }
+
+  /**
+   * @param region
+   * @return data stored in this region's data-file or "" if there is no such file
+   */
+  private String getData(String region) {
+    return this.getZnode("data", region);
+  }
+
+  /**
+   * Sets strData as the contents of this region's data-file.
+   * @param region
+   * @param strData
+   */
+  private void setData(String region, String strData) {
+    this.setZnode("data", region, strData);
+  }
+
+  /**
+   * Returns the text stored in folder/region.
+   * @param folder
+   * @param region
+   * @return data from folder/region, or "" on error
+   */
+  private String getZnode(String folder, String region) {
+    String path = this.joinPath(this.ZNodeName, folder, region);
+    this.ensureExists(path);
+
+    try {
+      byte[] bt = this.zkWrapper.readZNode(path, null);
+      return bt == null ? "" : new String(bt);
+    } catch (IOException e) {
+      LOG.error("Exception occurred during read from " + path, e);
+      return "";
+    }
+  }
+
+  /**
+   * Writes strData to folder/region.
+   * @param folder
+   * @param region
+   * @param strData
+   */
+  private void setZnode(String folder, String region, String strData) {
+    String path = this.joinPath(this.ZNodeName, folder, region);
+    this.ensureExists(path);
+
+    try {
+      this.zkWrapper.writeZNode(this.zkWrapper.parentZNode, path, strData);
+    } catch (final InterruptedException e) {
+      LOG.error("Can't set data to ZNode '" + this.zkWrapper.parentZNode + "->"
+          + path + "' after calling ensureExists.", e);
+    } catch (final KeeperException e) {
+      LOG.error("Can't set data to ZNode '" + this.zkWrapper.parentZNode + "->"
+          + path + "' after calling ensureExists.", e);
+    }
+  }
+
+  /**
+   * Make sure this znode exists by creating it if it's missing
+   * @param path
+   */
+  private void ensureExists(String path) {
+    path = this.joinPath(this.zkWrapper.parentZNode, path);
+    this.zkWrapper.ensureExists(path);
+  }
+
+  /**
+   * @param args - names of folders, files
+   * @return joined path
+   */
+  private String joinPath(String... args) {
+    String res = "";
+    if (args.length > 0) {
+      res = args[0];
+      for (int i = 1; i < args.length; i++) {
+        res = this.zkWrapper.getZNode(res, args[i]);
+      }
+    }
+    return res;
+  }
+}
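
A rough sanity check of the bookkeeping above (not part of the diff): RegionAvailabilityInfo.addDuration
subtracts (to - from) / timeDif from an availability that starts at 1.0, so a single one-second outage
costs roughly 11.57e-6 of last-day availability and 1.65e-6 of last-week availability, which matches the
EPS comment in the new test further down:

    public class AvailabilityMath {
      public static void main(String[] args) {
        // Mirrors RegionAvailabilityInfo.addDuration(from, to, timeDif), which subtracts
        // (to - from) / timeDif from an initial availability of 1.0.
        long dayMs = 24L * 60L * 60L * 1000L;   // DAY_TIME_DIF
        long weekMs = 7L * dayMs;               // WEEK_TIME_DIF
        long outageMs = 1000L;                  // one region unavailable for one second

        double dayAvailability = 1.0 - (double) outageMs / dayMs;   // ~0.9999884 (drop of ~11.57e-6)
        double weekAvailability = 1.0 - (double) outageMs / weekMs; // ~0.9999983 (drop of ~1.65e-6)

        System.out.println("last-day availability  = " + dayAvailability);
        System.out.println("last-week availability = " + weekAvailability);
      }
    }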

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java?rev=1509360&r1=1509359&r2=1509360&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/RegionManager.java Thu Aug  1 18:18:18 2013
@@ -96,6 +96,9 @@ public class RegionManager {
 
   private final AssignmentManager assignmentManager;
 
+  private int metaRegionsCount = 0;
+  private int notMetaRegionsCount = 0;
+
   /**
    * Map key -> tableName, value -> ThrottledRegionReopener
    * An entry is created in the map before an alter operation is performed on the
@@ -1716,6 +1719,22 @@ public class RegionManager {
     numberOfMetaRegions.set(num);
   }
 
+  public void addRegionsInfo(int r, boolean isMeta)
+  {
+    if(isMeta) {
+      metaRegionsCount = r;
+    }
+    else {
+      notMetaRegionsCount = r;
+    }
+  }
+
+  public int getRegionsCount()
+  {
+    //meta + not-meta + root
+    return metaRegionsCount + notMetaRegionsCount + 1;
+  }
+
   /**
    * Starts an action that is specific to a column family.
    * @param regionName

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1509360&r1=1509359&r2=1509360&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Thu Aug  1 18:18:18 2013
@@ -145,6 +145,8 @@ public class ServerManager {
   private static final ConcurrentHashMap<String, Long> blacklistedRSHostPortMap =
       new ConcurrentHashMap<String, Long>();
 
+  private final RegionChecker regionChecker;
+
   /*
    * Dumps into log current stats on dead servers and number of servers
    * TODO: Make this a metric; dump metrics into log.
@@ -226,6 +228,9 @@ public class ServerManager {
         n + "ServerManager-Timeout-Monitor");
     
     this.pendingMsgsToSvrsMap = new ConcurrentHashMap<HServerInfo, ArrayList<HMsg>>();
+
+    this.regionChecker = new RegionChecker(master);
+
     this.blacklistNodeExpirationTimeWindow = c.getLong("hbase.master.blacklist.expiration.window",
         DEFAULT_BLACKLIST_NODE_EXPIRATION_WINDOW);
     this.blacklistUpdateInterval = c.getLong("hbase.master.blacklist.update.interval",
@@ -476,6 +481,11 @@ public class ServerManager {
    * @param msgs
    */
   private void processRegionServerExit(HServerInfo serverInfo, HMsg[] msgs) {
+
+    for(int i = 1; i < msgs.length; i++) {
+      this.regionChecker.becameClosed(msgs[i].getRegionInfo());
+    }
+
     // This method removes ROOT/META from the list and marks them to be
     // reassigned in addition to other housework.
     processServerInfoOnShutdown(serverInfo);
@@ -811,6 +821,7 @@ public class ServerManager {
    */
   public void processRegionOpen(HServerInfo serverInfo,
       HRegionInfo region, ArrayList<HMsg> returnMsgs) {
+
     boolean duplicateAssignment = false;
     RegionManager regionManager = master.getRegionManager();
     synchronized (regionManager) {
@@ -865,6 +876,7 @@ public class ServerManager {
         if (region.isRootRegion()) {
           // it was assigned, and it's not a duplicate assignment, so take it out
           // of the unassigned list.
+          regionChecker.becameOpened(region);
           regionManager.removeRegion(region);
 
           // Store the Root Region location (in memory)
@@ -892,6 +904,8 @@ public class ServerManager {
    * @param region
    */
   public void processRegionClose(HServerInfo serverInfo, HRegionInfo region) {
+    this.regionChecker.becameClosed(region);
+
     synchronized (this.master.getRegionManager()) {
       if (region.isRootRegion()) {
         // Root region
@@ -1087,6 +1101,7 @@ public class ServerManager {
     // First check a server to expire.  ServerName is of the form:
     // <hostname> , <port> , <startcode>
     String serverName = hsi.getServerName();
+
     HServerInfo info = this.serversToServerInfo.get(serverName);
     if (info == null) {
       LOG.warn("No HServerInfo for " + serverName);
@@ -1096,6 +1111,9 @@ public class ServerManager {
       LOG.warn("Already processing shutdown of " + serverName);
       return;
     }
+
+    long expiredSince = serversToLoad.get(serverName).lastLoadRefreshTime;
+
     synchronized (deadServerStatusLock) {
       // Remove the server from the known servers lists and update load info
       this.serversToServerInfo.remove(serverName);
@@ -1109,7 +1127,7 @@ public class ServerManager {
       this.master.getSplitLogManager().handleDeadServer(serverName);
     }
     this.master.getRegionServerOperationQueue().
-      put(new ProcessServerShutdown(master, info));
+      put(new ProcessServerShutdown(master, info, expiredSince));
     this.master.getMetrics().incRegionServerExpired();
   }
 
@@ -1149,6 +1167,10 @@ public class ServerManager {
     return this.deadServers;
   }
 
+  public RegionChecker getRegionChecker() {
+    return this.regionChecker;
+  }
+
   public ServerLoadMap<HServerLoad> getServersToLoad() {
     return serversToLoad;
   }

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1509360&r1=1509359&r2=1509360&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Thu Aug  1 18:18:18 2013
@@ -904,28 +904,30 @@ public class HRegionServer implements HR
       LOG.warn("Stopping - unexpected ...", t);
     }
 
-    // tell the master that we are going to shut down
-    // do it on separate thread because we don't want to block here if
-    // master is inaccessible. It is OK if this thread's message arrives
-    // out of order at the master.
-    Thread t = new Thread() {
-      @Override
-      public void run() {
-        try {
-          HMsg[] exitMsg = new HMsg[1];
-          exitMsg[0] = REPORT_BEGINNING_OF_THE_END;
-          LOG.info("prepping master for region server shutdown : " +
-              serverInfo.getServerName());
-          hbaseMaster.regionServerReport(serverInfo, exitMsg, (HRegionInfo[])null);
-        } catch (Throwable e) {
-          LOG.warn("Failed to send exiting message to master: ",
-              RemoteExceptionHandler.checkThrowable(e));
+    if(!killed) {
+      // tell the master that we are going to shut down
+      // do it on separate thread because we don't want to block here if
+      // master is inaccessible. It is OK if this thread's message arrives
+      // out of order at the master.
+      Thread t = new Thread() {
+        @Override
+        public void run() {
+          try {
+            HMsg[] exitMsg = new HMsg[1];
+            exitMsg[0] = REPORT_BEGINNING_OF_THE_END;
+            LOG.info("prepping master for region server shutdown : " +
+                serverInfo.getServerName());
+            hbaseMaster.regionServerReport(serverInfo, exitMsg, (HRegionInfo[])null);
+          } catch (Throwable e) {
+            LOG.warn("Failed to send exiting message to master: ",
+                RemoteExceptionHandler.checkThrowable(e));
+          }
         }
-      }
-    };
-    t.setName("reporting-start-of-exit-to-master");
-    t.setDaemon(true);
-    t.start();
+      };
+      t.setName("reporting-start-of-exit-to-master");
+      t.setDaemon(true);
+      t.start();
+    }
 
     if (killed) {
       // Just skip out w/o closing regions.

Added: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestRegionChecker.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestRegionChecker.java?rev=1509360&view=auto
==============================================================================
--- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestRegionChecker.java (added)
+++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestRegionChecker.java Thu Aug  1 18:18:18 2013
@@ -0,0 +1,320 @@
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.master.RegionChecker.RegionAvailabilityInfo;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestRegionChecker {
+  final static Log LOG = LogFactory.getLog(TestRegionChecker.class);
+  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private final static int SLAVES = 4;
+  private static int lastRegionOpenedCount = 0;
+  private static HBaseAdmin admin;
+  private static int REGION_NUM = 10;
+  private static int META_REGION_NUM = 2;
+  private static RegionChecker regionChecker;
+  private static MiniHBaseCluster cluster;
+  private static final String TABLE_NAME_BASE = "testRegionAssignment";
+  private static boolean firstTableCreated = false;
+
+  /*
+    EPS is small enough for comparing availabilities
+    before and after RegionChecker events:
+
+    we have two availabilities, both numbers in [0, 1]:
+    a - availability before
+    b - availability after
+    after killing a region and having it unavailable for 1 sec,
+    b will be = a - 1000/timeDif
+    timeDif is 7*24*60*60*1000 for the week => b = a - 1.65344e-6
+    timeDif is 24*60*60*1000 for the day => b = a - 11.57408e-6
+  */
+  private final double EPS = 1e-9;
+
+  @BeforeClass
+  public static void setupBeforeClass() throws Exception {
+    init(true);
+    // ONLY meta regions, ROOT and META, are assigned at beginning.
+    verifyRegionMovementNum(META_REGION_NUM);
+  }
+
+  public static void init(boolean enableRegionChecker) throws Exception
+  {
+    Configuration conf = TEST_UTIL.getConfiguration();
+    // Enable the favored nodes based load balancer
+    conf.set("hbase.loadbalancer.impl",
+      "org.apache.hadoop.hbase.master.RegionManager$AssignmentLoadBalancer");
+
+    conf.setInt("hbase.master.meta.thread.rescanfrequency", 5000);
+    conf.setInt("hbase.regionserver.msginterval", 1000);
+    conf.setLong("hbase.regionserver.transientAssignment.regionHoldPeriod", 2000);
+    conf.setBoolean("hbase.master.regionchecker.enabled", enableRegionChecker);
+
+    TEST_UTIL.startMiniCluster(SLAVES);
+
+    admin = new HBaseAdmin(conf);
+
+    cluster = TEST_UTIL.getHBaseCluster();
+    regionChecker = cluster.getActiveMaster().getServerManager().getRegionChecker();
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  @Test(timeout = 180000)
+  public void testDisabledRegionChecker() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+    init(false);
+
+    assertEquals(-1.0, regionChecker.getLastDayAvailability(), EPS);
+    assertEquals(-1.0, regionChecker.getLastWeekAvailability(), EPS);
+    assertTrue(regionChecker.getDetailedLastDayAvailability().isEmpty());
+    assertTrue(regionChecker.getDetailedLastWeekAvailability().isEmpty());
+  }
+
+  @Test(timeout = 180000)
+  public void testAvailabilityGoesDownWithRegionFail() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+    init(true);
+
+    // Create a table with REGION_NUM regions.
+    String tableName = TABLE_NAME_BASE + "testAvailabilityGoesDownWithRegionFail";
+    createTable(tableName, REGION_NUM);
+    HTable ht = new HTable(TEST_UTIL.getConfiguration(), tableName);
+    Set<HRegionInfo> allRegions = ht.getRegionsInfo().keySet();
+    final int regionsMove = 1;
+
+    int serverId = this.getRegionServerId();
+    HRegionInfo regionToKill = null;
+    List<String> regionsToKill = new ArrayList<String> ();
+    for (HRegion region : cluster.getRegionServer(serverId).getOnlineRegionsAsArray()) {
+      if (!region.getRegionInfo().isMetaRegion() && !region.getRegionInfo().isRootRegion()) {
+        regionToKill = region.getRegionInfo();
+        regionsToKill.add(regionToKill.getRegionNameAsString());
+        break;
+      }
+    }
+
+    LOG.debug("killing '" + regionToKill.getRegionNameAsString() + "' region");
+    cluster.getRegionServer(serverId).closeRegion(regionToKill, true);
+    verifyRegionMovementNum(regionsMove);
+    LOG.debug("killed '" + regionToKill.getRegionNameAsString() + "' region");
+
+    check(allRegions, regionsToKill);
+
+    deleteTable(tableName, regionsMove);
+  }
+
+  @Test(timeout = 180000)
+  public void testAvailabilityGoesDownWithRegionServerCleanFail() throws Exception {
+    testAvailabilityGoesDownWithRegionServerFail(true);
+  }
+
+  @Test(timeout = 180000)
+  public void testAvailabilityGoesDownWithRegionServerUncleanFail() throws Exception {
+    testAvailabilityGoesDownWithRegionServerFail(false);
+  }
+
+  public void testAvailabilityGoesDownWithRegionServerFail(boolean isFailClean) throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+    init(true);
+
+    // Create a table with REGION_NUM regions.
+    String tableName = TABLE_NAME_BASE + "testAvailabilityGoesDownWithRegionServerFail" + isFailClean;
+    createTable(tableName, REGION_NUM);
+    HTable ht = new HTable(TEST_UTIL.getConfiguration(), tableName);
+    Set<HRegionInfo> allRegions = ht.getRegionsInfo().keySet();
+
+    int serverId = this.getRegionServerId();
+
+    List<String> regionsToKill = new ArrayList<String>();
+    for (HRegionInfo info : cluster.getRegionServer(serverId).getRegionsAssignment()) {
+      regionsToKill.add(info.getRegionNameAsString());
+    }
+
+    int regionCnt = cluster.getRegionServer(serverId).getOnlineRegions().size();
+
+    if(isFailClean) {
+      LOG.debug("killing regionServer clean");
+      cluster.stopRegionServer(serverId);
+      LOG.debug("killed regionServer clean");
+    }
+    else {
+      LOG.debug("killing regionServer unclean");
+      cluster.getRegionServer(serverId).kill();
+      LOG.debug("killed regionServer unclean");
+    }
+
+    verifyRegionMovementNum(regionCnt);
+
+    check(allRegions, regionsToKill);
+
+    deleteTable(tableName, regionCnt);
+  }
+
+  private void check(Set<HRegionInfo> allRegions, List<String> regionsToKill)
+  {
+    double avDayBefore = 1.0;
+    double avWeekBefore = 1.0;
+    double avDayAfter = regionChecker.getLastDayAvailability();
+    double avWeekAfter = regionChecker.getLastWeekAvailability();
+    Map<String, RegionAvailabilityInfo> avDetDayAfter = regionChecker.getDetailedLastDayAvailability();
+    Map<String, RegionAvailabilityInfo> avDetWeekAfter = regionChecker.getDetailedLastWeekAvailability();
+
+    LOG.debug("avDayBefore " + avDayBefore);
+    LOG.debug("avDayAfter " + avDayAfter);
+    LOG.debug("avWeekBefore " + avWeekBefore);
+    LOG.debug("avWeekAfter " + avWeekAfter);
+
+    // check that after killing some server dayAvailability and weekAvailability decreases
+    assertTrue(avDayBefore - avDayAfter > this.EPS);
+    assertTrue(avWeekBefore - avWeekAfter > this.EPS);
+
+    // server regions avDetDay:
+    for(String region : regionsToKill) {
+      if(!avDetDayAfter.containsKey(region)) {
+        fail("Day detailed info must contain availability info about region '" + region
+            + "', because it was closed");
+      }
+      assert (1.0 - avDetDayAfter.get(region).getAvailability() > this.EPS);
+    }
+
+    // server regions avWeekDay:
+    for(String region : regionsToKill) {
+      if(!avDetWeekAfter.containsKey(region)) {
+        fail("Week detailed info must contain availability info about region '" + region
+            + "', because it was closed");
+      }
+      assert (1.0 - avDetWeekAfter.get(region).getAvailability() > this.EPS);
+    }
+
+    // not server regions avDetDay:
+    for (HRegionInfo info : allRegions) {
+      String region = info.getRegionNameAsString();
+      if (avDetDayAfter.containsKey(region) && !regionsToKill.contains(region)) {
+        fail("Detailed availability map shouldn't contain key " + region
+            + ", because this region wasn't killed");
+      }
+    }
+
+    // not server regions avWeekDay:
+    for (HRegionInfo info : allRegions) {
+      String region = info.getRegionNameAsString();
+      if (avDetWeekAfter.containsKey(region) && !regionsToKill.contains(region)) {
+        fail("Detailed availability map shouldn't contain key " + region
+            + ", because this region wasn't killed");
+      }
+    }
+  }
+
+  /** Get the region server
+   * that is currently hosting ROOT
+   * @return
+   * @throws IOException
+   */
+  private int getRegionServerId() throws IOException {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    for (int i = 0; i < SLAVES; i++) {
+      if (cluster.getRegionServer(i).getRegionsAssignment().length > 0) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Verify the number of region movement is expected
+   * @param expected
+   * @throws InterruptedException
+   */
+  private static void verifyRegionMovementNum(int expected) throws InterruptedException {
+    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    HMaster m = cluster.getActiveMaster();
+
+    int retry = 10;
+    long sleep = 3 * TEST_UTIL.getConfiguration().getInt("hbase.regionserver.msginterval", 1000);
+    int attempt = 0;
+    int currentRegionOpened, regionMovement;
+    do {
+      currentRegionOpened = m.getMetrics().getRegionsOpened();
+      regionMovement = currentRegionOpened - lastRegionOpenedCount;
+      LOG.debug("There are " + regionMovement + "/" + expected + " regions moved after " + attempt
+          + " attempts");
+      Thread.sleep((++attempt) * sleep);
+    } while (regionMovement != expected && attempt <= retry);
+
+    // update the lastRegionOpenedCount
+    lastRegionOpenedCount = currentRegionOpened;
+
+    assertEquals("There are only " + regionMovement + " instead of " + expected
+        + " region movement for " + attempt + " attempts", expected, regionMovement);
+  }
+
+  /**
+   * Create a table with specified table name and region number.
+   * @param table
+   * @param regionNum
+   * @return
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  private static void createTable(String table, int regionNum)
+      throws IOException, InterruptedException {
+
+    byte[] tableName = Bytes.toBytes(table);
+    int expectedRegions = regionNum;
+    byte[][] splitKeys = new byte[expectedRegions - 1][];
+    for (int i = 1; i < expectedRegions; i++) {
+      byte splitKey = (byte) i;
+      splitKeys[i - 1] = new byte[] { splitKey, splitKey, splitKey };
+    }
+
+    HTableDescriptor desc = new HTableDescriptor(tableName);
+    desc.addFamily(new HColumnDescriptor(HConstants.CATALOG_FAMILY));
+    admin.createTable(desc, splitKeys);
+
+    HTable ht = new HTable(TEST_UTIL.getConfiguration(), tableName);
+    Map<HRegionInfo, HServerAddress> regions = ht.getRegionsInfo();
+    assertEquals(
+      "Tried to create " + expectedRegions + " regions " + "but only found " + regions.size(),
+      expectedRegions, regions.size());
+
+    if(firstTableCreated == false)
+    {
+      firstTableCreated = true;
+      verifyRegionMovementNum(REGION_NUM);
+    }
+
+    return;
+  }
+
+  private static void deleteTable(String tableName, final int regionsMove) throws IOException {
+    admin.disableTable(tableName);
+    admin.deleteTable(tableName);
+    lastRegionOpenedCount -= regionsMove;
+  }
+}


