Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 117AC200CBC for ; Tue, 20 Jun 2017 22:07:34 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 0E035160BE1; Tue, 20 Jun 2017 20:07:34 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 53818160BCC for ; Tue, 20 Jun 2017 22:07:33 +0200 (CEST) Received: (qmail 38084 invoked by uid 500); 20 Jun 2017 20:07:32 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 38075 invoked by uid 99); 20 Jun 2017 20:07:32 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 20 Jun 2017 20:07:32 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 63E25DFB94; Tue, 20 Jun 2017 20:07:32 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: syuanjiang@apache.org To: commits@hbase.apache.org Message-Id: <0aebeb87c90a479d9bb0a47edb56f96d@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hbase git commit: HBASE-18036 Data locality is not maintained after cluster restart or SSH (Stephen Yuan Jiang) Date: Tue, 20 Jun 2017 20:07:32 +0000 (UTC) archived-at: Tue, 20 Jun 2017 20:07:34 -0000 Repository: hbase Updated Branches: refs/heads/branch-1 dead08d66 -> 532e0dda1 HBASE-18036 Data locality is not maintained after cluster restart or SSH (Stephen Yuan Jiang) Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/532e0dda Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/532e0dda Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/532e0dda Branch: refs/heads/branch-1 Commit: 532e0dda16f3c5034aa337201bf6d733cc0a1c7b Parents: dead08d Author: Stephen Yuan Jiang Authored: Tue Jun 20 13:07:19 2017 -0700 Committer: Stephen Yuan Jiang Committed: Tue Jun 20 13:07:19 2017 -0700 ---------------------------------------------------------------------- .../hadoop/hbase/master/ServerManager.java | 8 ++++++ .../master/procedure/ServerCrashProcedure.java | 26 +++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/532e0dda/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 71d03ce..2361c0c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -1203,6 +1203,14 @@ public class ServerManager { } /** + * Check whether a server is online based on hostname and port + * @return true if finding a server with matching hostname and port. + */ + public boolean isServerWithSameHostnamePortOnline(final ServerName serverName) { + return findServerWithSameHostnamePortWithLock(serverName) != null; + } + + /** * Check if a server is known to be dead. A server can be online, * or known to be dead, or unknown to this manager (i.e, not online, * not known to be dead either. it is simply not tracked by the http://git-wip-us.apache.org/repos/asf/hbase/blob/532e0dda/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index bfe3cc6..3463000 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -23,8 +23,10 @@ import java.io.InterruptedIOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.locks.Lock; @@ -547,13 +549,31 @@ implements ServerProcedureInterface { private boolean assign(final MasterProcedureEnv env, final List hris) throws InterruptedIOException { AssignmentManager am = env.getMasterServices().getAssignmentManager(); + // If the dead server already restarted, assign to the same server to preserve locality + boolean retainAssignment = + env.getMasterServices().getServerManager().isServerWithSameHostnamePortOnline(serverName) ? + true : false; try { - am.assign(hris); + if (retainAssignment) { + Map hriServerMap = + new HashMap(hris.size()); + for (HRegionInfo hri: hris) { + hriServerMap.put(hri, serverName); + } + LOG.info("Best effort in SSH to retain assignment of " + hris.size() + + " regions from the dead server " + serverName); + am.assign(hriServerMap); + } else { + LOG.info("Using round robin in SSH to assign " + hris.size() + + " regions from the dead server " + serverName); + am.assign(hris); + } } catch (InterruptedException ie) { - LOG.error("Caught " + ie + " during round-robin assignment"); + LOG.error("Caught " + ie + " during " + (retainAssignment ? "retaining" : "round-robin") + + " assignment"); throw (InterruptedIOException)new InterruptedIOException().initCause(ie); } catch (IOException ioe) { - LOG.info("Caught " + ioe + " during region assignment, will retry"); + LOG.warn("Caught " + ioe + " during region assignment, will retry"); return false; } return true;