Return-Path:
X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io
Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io
Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 2626C200CCE for ; Sun, 23 Jul 2017 17:08:45 +0200 (CEST)
Received: by cust-asf.ponee.io (Postfix) id 24BBF1646B2; Sun, 23 Jul 2017 15:08:45 +0000 (UTC)
Delivered-To: archive-asf-public@cust-asf.ponee.io
Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id D1ADD1642F9 for ; Sun, 23 Jul 2017 17:08:42 +0200 (CEST)
Received: (qmail 67874 invoked by uid 500); 23 Jul 2017 15:08:39 -0000
Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm
Precedence: bulk
List-Help:
List-Unsubscribe:
List-Post:
List-Id:
Reply-To: dev@hbase.apache.org
Delivered-To: mailing list commits@hbase.apache.org
Received: (qmail 66311 invoked by uid 99); 23 Jul 2017 15:08:37 -0000
Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Sun, 23 Jul 2017 15:08:37 +0000
Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id D170EDFF8A; Sun, 23 Jul 2017 15:08:36 +0000 (UTC)
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: git-site-role@apache.org
To: commits@hbase.apache.org
Date: Sun, 23 Jul 2017 15:08:53 -0000
Message-Id:
In-Reply-To: <10dce31be4034f58b712555cb49aa90c@git.apache.org>
References: <10dce31be4034f58b712555cb49aa90c@git.apache.org>
X-Mailer: ASF-Git Admin Mailer
Subject: [18/51] [partial] hbase-site git commit: Published site at 82d554e3783372cc6b05489452c815b57c06f6cd.
archived-at: Sun, 23 Jul 2017 15:08:45 -0000

http://git-wip-us.apache.org/repos/asf/hbase-site/blob/0383a9c2/devapidocs/src-html/org/apache/hadoop/hbase/master/ServerManager.html
----------------------------------------------------------------------
diff --git a/devapidocs/src-html/org/apache/hadoop/hbase/master/ServerManager.html b/devapidocs/src-html/org/apache/hadoop/hbase/master/ServerManager.html
index 15fec0a..d30e78b 100644
--- a/devapidocs/src-html/org/apache/hadoop/hbase/master/ServerManager.html
+++ b/devapidocs/src-html/org/apache/hadoop/hbase/master/ServerManager.html
@@ -630,684 +630,653 @@
622 }
623 }
624
-625 /**
-626 * Sends an MERGE REGIONS RPC to the specified server to merge the specified
-627 * regions.
-628 * <p>
-629 * A region server could reject the close request because it either does not
-630 * have the specified region.
-631 * @param server server to merge regions -632 * @param region_a region to merge -633 * @param region_b region to merge -634 * @param forcible true if do a compulsory merge, otherwise we will only merge -635 * two adjacent regions -636 * @throws IOException -637 */ -638 public void sendRegionsMerge(ServerName server, HRegionInfo region_a, -639 HRegionInfo region_b, boolean forcible, final User user) throws IOException { -640 if (server == null) -641 throw new NullPointerException("Passed server is null"); -642 if (region_a == null || region_b == null) -643 throw new NullPointerException("Passed region is null"); -644 AdminService.BlockingInterface admin = getRsAdmin(server); -645 if (admin == null) { -646 throw new IOException("Attempting to send MERGE REGIONS RPC to server " -647 + server.toString() + " for region " -648 + region_a.getRegionNameAsString() + "," -649 + region_b.getRegionNameAsString() -650 + " failed because no RPC connection found to this server"); -651 } -652 HBaseRpcController controller = newRpcController(); -653 ProtobufUtil.mergeRegions(controller, admin, region_a, region_b, forcible, user); -654 } -655 -656 @VisibleForTesting -657 public void moveFromOnlineToDeadServers(final ServerName sn) { -658 synchronized (onlineServers) { -659 if (!this.onlineServers.containsKey(sn)) { -660 LOG.warn("Expiration of " + sn + " but server not online"); -661 } -662 // Remove the server from the known servers lists and update load info BUT -663 // add to deadservers first; do this so it'll show in dead servers list if -664 // not in online servers list. -665 this.deadservers.add(sn); -666 this.onlineServers.remove(sn); -667 onlineServers.notifyAll(); -668 } -669 this.rsAdmins.remove(sn); -670 } -671 -672 public synchronized void processDeadServer(final ServerName serverName, boolean shouldSplitWal) { -673 // When assignment manager is cleaning up the zookeeper nodes and rebuilding the -674 // in-memory region states, region servers could be down. Meta table can and -675 // should be re-assigned, log splitting can be done too. However, it is better to -676 // wait till the cleanup is done before re-assigning user regions. -677 // -678 // We should not wait in the server shutdown handler thread since it can clog -679 // the handler threads and meta table could not be re-assigned in case -680 // the corresponding server is down. So we queue them up here instead. -681 if (!master.getAssignmentManager().isFailoverCleanupDone()) { -682 requeuedDeadServers.put(serverName, shouldSplitWal); -683 return; -684 } -685 -686 this.deadservers.add(serverName); -687 master.getAssignmentManager().submitServerCrash(serverName, shouldSplitWal); -688 } -689 -690 /** -691 * Process the servers which died during master's initialization. It will be -692 * called after HMaster#assignMeta and AssignmentManager#joinCluster. 
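The dead-server handling shown above defers work while the master is still initializing: processDeadServer() parks the server name and its shouldSplitWal flag in requeuedDeadServers until the AssignmentManager's failover cleanup is done, and processQueuedDeadServers() (whose javadoc begins above, body follows) later drains that map. Boiled down to plain Java, the pattern looks roughly like this; a minimal sketch, not the ServerManager code itself:

    import java.util.LinkedHashMap;
    import java.util.Map;

    // Minimal model of "queue events until initialization finishes, then drain".
    final class DeferredDeadServers {
      private final Map<String, Boolean> requeued = new LinkedHashMap<>();
      private boolean cleanupDone;

      synchronized void onDeadServer(String server, boolean shouldSplitWal) {
        if (!cleanupDone) {
          requeued.put(server, shouldSplitWal);   // defer: cleanup still running
          return;
        }
        submitCrashHandling(server, shouldSplitWal);
      }

      synchronized void onCleanupFinished() {
        cleanupDone = true;
        for (Map.Entry<String, Boolean> e : requeued.entrySet()) {
          submitCrashHandling(e.getKey(), e.getValue());
        }
        requeued.clear();
      }

      private void submitCrashHandling(String server, boolean shouldSplitWal) {
        System.out.println("handle crash of " + server + " splitWal=" + shouldSplitWal);
      }
    }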
-693 * */ -694 synchronized void processQueuedDeadServers() { -695 if (!master.isServerCrashProcessingEnabled()) { -696 LOG.info("Master hasn't enabled ServerShutdownHandler"); -697 } -698 Iterator<ServerName> serverIterator = queuedDeadServers.iterator(); -699 while (serverIterator.hasNext()) { -700 ServerName tmpServerName = serverIterator.next(); -701 expireServer(tmpServerName); -702 serverIterator.remove(); -703 requeuedDeadServers.remove(tmpServerName); -704 } -705 -706 if (!master.getAssignmentManager().isFailoverCleanupDone()) { -707 LOG.info("AssignmentManager hasn't finished failover cleanup; waiting"); -708 } -709 -710 for (Map.Entry<ServerName, Boolean> entry : requeuedDeadServers.entrySet()) { -711 processDeadServer(entry.getKey(), entry.getValue()); -712 } -713 requeuedDeadServers.clear(); -714 } -715 -716 /* -717 * Remove the server from the drain list. -718 */ -719 public boolean removeServerFromDrainList(final ServerName sn) { -720 // Warn if the server (sn) is not online. ServerName is of the form: -721 // <hostname> , <port> , <startcode> +625 @VisibleForTesting +626 public void moveFromOnlineToDeadServers(final ServerName sn) { +627 synchronized (onlineServers) { +628 if (!this.onlineServers.containsKey(sn)) { +629 LOG.warn("Expiration of " + sn + " but server not online"); +630 } +631 // Remove the server from the known servers lists and update load info BUT +632 // add to deadservers first; do this so it'll show in dead servers list if +633 // not in online servers list. +634 this.deadservers.add(sn); +635 this.onlineServers.remove(sn); +636 onlineServers.notifyAll(); +637 } +638 this.rsAdmins.remove(sn); +639 } +640 +641 public synchronized void processDeadServer(final ServerName serverName, boolean shouldSplitWal) { +642 // When assignment manager is cleaning up the zookeeper nodes and rebuilding the +643 // in-memory region states, region servers could be down. Meta table can and +644 // should be re-assigned, log splitting can be done too. However, it is better to +645 // wait till the cleanup is done before re-assigning user regions. +646 // +647 // We should not wait in the server shutdown handler thread since it can clog +648 // the handler threads and meta table could not be re-assigned in case +649 // the corresponding server is down. So we queue them up here instead. +650 if (!master.getAssignmentManager().isFailoverCleanupDone()) { +651 requeuedDeadServers.put(serverName, shouldSplitWal); +652 return; +653 } +654 +655 this.deadservers.add(serverName); +656 master.getAssignmentManager().submitServerCrash(serverName, shouldSplitWal); +657 } +658 +659 /** +660 * Process the servers which died during master's initialization. It will be +661 * called after HMaster#assignMeta and AssignmentManager#joinCluster. 
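moveFromOnlineToDeadServers() above is deliberate about ordering: the server is added to deadservers before it is removed from onlineServers, so a concurrent reader always finds it in at least one of the two views, and waiters on the onlineServers monitor are then notified. A pure-Java sketch of that invariant (illustration only, not the ServerManager code):

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    // Sketch: never let a server vanish from both views at once.
    final class OnlineDeadBookkeeping {
      private final Map<String, Object> online = new HashMap<>();
      private final Set<String> dead = new HashSet<>();

      void expire(String server) {
        synchronized (online) {
          dead.add(server);          // 1. visible as dead first
          online.remove(server);     // 2. then drop the online entry
          online.notifyAll();        // 3. wake anyone waiting on the online set
        }
      }
    }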
+662 * */ +663 synchronized void processQueuedDeadServers() { +664 if (!master.isServerCrashProcessingEnabled()) { +665 LOG.info("Master hasn't enabled ServerShutdownHandler"); +666 } +667 Iterator<ServerName> serverIterator = queuedDeadServers.iterator(); +668 while (serverIterator.hasNext()) { +669 ServerName tmpServerName = serverIterator.next(); +670 expireServer(tmpServerName); +671 serverIterator.remove(); +672 requeuedDeadServers.remove(tmpServerName); +673 } +674 +675 if (!master.getAssignmentManager().isFailoverCleanupDone()) { +676 LOG.info("AssignmentManager hasn't finished failover cleanup; waiting"); +677 } +678 +679 for (Map.Entry<ServerName, Boolean> entry : requeuedDeadServers.entrySet()) { +680 processDeadServer(entry.getKey(), entry.getValue()); +681 } +682 requeuedDeadServers.clear(); +683 } +684 +685 /* +686 * Remove the server from the drain list. +687 */ +688 public boolean removeServerFromDrainList(final ServerName sn) { +689 // Warn if the server (sn) is not online. ServerName is of the form: +690 // <hostname> , <port> , <startcode> +691 +692 if (!this.isServerOnline(sn)) { +693 LOG.warn("Server " + sn + " is not currently online. " + +694 "Removing from draining list anyway, as requested."); +695 } +696 // Remove the server from the draining servers lists. +697 return this.drainingServers.remove(sn); +698 } +699 +700 /* +701 * Add the server to the drain list. +702 */ +703 public boolean addServerToDrainList(final ServerName sn) { +704 // Warn if the server (sn) is not online. ServerName is of the form: +705 // <hostname> , <port> , <startcode> +706 +707 if (!this.isServerOnline(sn)) { +708 LOG.warn("Server " + sn + " is not currently online. " + +709 "Ignoring request to add it to draining list."); +710 return false; +711 } +712 // Add the server to the draining servers lists, if it's not already in +713 // it. +714 if (this.drainingServers.contains(sn)) { +715 LOG.warn("Server " + sn + " is already in the draining server list." + +716 "Ignoring request to add it again."); +717 return false; +718 } +719 LOG.info("Server " + sn + " added to draining server list."); +720 return this.drainingServers.add(sn); +721 } 722 -723 if (!this.isServerOnline(sn)) { -724 LOG.warn("Server " + sn + " is not currently online. " + -725 "Removing from draining list anyway, as requested."); -726 } -727 // Remove the server from the draining servers lists. -728 return this.drainingServers.remove(sn); -729 } -730 -731 /* -732 * Add the server to the drain list. +723 // RPC methods to region servers +724 +725 /** +726 * Sends an OPEN RPC to the specified server to open the specified region. +727 * <p> +728 * Open should not fail but can if server just crashed. +729 * <p> +730 * @param server server to open a region +731 * @param region region to open +732 * @param favoredNodes 733 */ -734 public boolean addServerToDrainList(final ServerName sn) { -735 // Warn if the server (sn) is not online. ServerName is of the form: -736 // <hostname> , <port> , <startcode> -737 -738 if (!this.isServerOnline(sn)) { -739 LOG.warn("Server " + sn + " is not currently online. " + -740 "Ignoring request to add it to draining list."); -741 return false; -742 } -743 // Add the server to the draining servers lists, if it's not already in -744 // it. -745 if (this.drainingServers.contains(sn)) { -746 LOG.warn("Server " + sn + " is already in the draining server list." 
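Both drain-list methods above return a boolean and treat an offline server differently: removeServerFromDrainList() warns but removes anyway, while addServerToDrainList() refuses offline servers and duplicates. A hypothetical illustration of the documented return values (sm, liveSn and deadSn are assumed, not taken from this commit):

    import org.apache.hadoop.hbase.ServerName;
    import org.apache.hadoop.hbase.master.ServerManager;

    // Hypothetical illustration; liveSn is an online region server, deadSn an offline one.
    final class DrainListExample {
      static void demo(ServerManager sm, ServerName liveSn, ServerName deadSn) {
        boolean first = sm.addServerToDrainList(liveSn);        // true: first add of an online server
        boolean again = sm.addServerToDrainList(liveSn);        // false: duplicates are refused
        boolean offline = sm.addServerToDrainList(deadSn);      // false: offline servers are refused
        boolean removed = sm.removeServerFromDrainList(liveSn); // true: removal proceeds (warns if offline)
        System.out.println(first + " " + again + " " + offline + " " + removed);
      }
    }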
+ -747 "Ignoring request to add it again."); -748 return false; -749 } -750 LOG.info("Server " + sn + " added to draining server list."); -751 return this.drainingServers.add(sn); -752 } -753 -754 // RPC methods to region servers -755 -756 /** -757 * Sends an OPEN RPC to the specified server to open the specified region. -758 * <p> -759 * Open should not fail but can if server just crashed. -760 * <p> -761 * @param server server to open a region -762 * @param region region to open -763 * @param favoredNodes -764 */ -765 public RegionOpeningState sendRegionOpen(final ServerName server, -766 HRegionInfo region, List<ServerName> favoredNodes) -767 throws IOException { -768 AdminService.BlockingInterface admin = getRsAdmin(server); -769 if (admin == null) { -770 throw new IOException("Attempting to send OPEN RPC to server " + server.toString() + -771 " failed because no RPC connection found to this server"); -772 } -773 OpenRegionRequest request = -774 RequestConverter.buildOpenRegionRequest(server, region, favoredNodes, false); -775 try { -776 OpenRegionResponse response = admin.openRegion(null, request); -777 return ResponseConverter.getRegionOpeningState(response); -778 } catch (ServiceException se) { -779 checkForRSznode(server, se); -780 throw ProtobufUtil.getRemoteException(se); -781 } -782 } -783 -784 /** -785 * Check for an odd state, where we think an RS is up but it is not. Do it on OPEN. -786 * This is only case where the check makes sense. -787 * -788 * <p>We are checking for instance of HBASE-9593 where a RS registered but died before it put -789 * up its znode in zk. In this case, the RS made it into the list of online servers but it -790 * is not actually UP. We do the check here where there is an evident problem rather -791 * than do some crazy footwork where we'd have master check zk after a RS had reported -792 * for duty with provisional state followed by a confirmed state; that'd be a mess. -793 * Real fix is HBASE-17733. -794 */ -795 private void checkForRSznode(final ServerName serverName, final ServiceException se) { -796 if (se.getCause() == null) return; -797 Throwable t = se.getCause(); -798 if (t instanceof ConnectException) { -799 // If this, proceed to do cleanup. -800 } else { -801 // Look for FailedServerException -802 if (!(t instanceof IOException)) return; -803 if (t.getCause() == null) return; -804 if (!(t.getCause() instanceof FailedServerException)) return; -805 // Ok, found FailedServerException -- continue. -806 } -807 if (!isServerOnline(serverName)) return; -808 // We think this server is online. Check it has a znode up. Currently, a RS -809 // registers an ephereral znode in zk. If not present, something is up. Maybe -810 // HBASE-9593 where RS crashed AFTER reportForDuty but BEFORE it put up an ephemeral -811 // znode. -812 List<String> servers = null; -813 try { -814 servers = getRegionServersInZK(this.master.getZooKeeper()); -815 } catch (KeeperException ke) { -816 LOG.warn("Failed to list regionservers", ke); -817 // ZK is malfunctioning, don't hang here -818 } -819 boolean found = false; -820 if (servers != null) { -821 for (String serverNameAsStr: servers) { -822 ServerName sn = ServerName.valueOf(serverNameAsStr); -823 if (sn.equals(serverName)) { -824 // Found a server up in zk. 
-825 found = true; -826 break; -827 } -828 } -829 } -830 if (!found) { -831 LOG.warn("Online server " + serverName.toString() + " has no corresponding " + -832 "ephemeral znode (Did it die before registering in zk?); " + -833 "calling expire to clean it up!"); -834 expireServer(serverName); -835 } -836 } -837 -838 /** -839 * Sends an OPEN RPC to the specified server to open the specified region. -840 * <p> -841 * Open should not fail but can if server just crashed. +734 public RegionOpeningState sendRegionOpen(final ServerName server, +735 HRegionInfo region, List<ServerName> favoredNodes) +736 throws IOException { +737 AdminService.BlockingInterface admin = getRsAdmin(server); +738 if (admin == null) { +739 throw new IOException("Attempting to send OPEN RPC to server " + server.toString() + +740 " failed because no RPC connection found to this server"); +741 } +742 OpenRegionRequest request = +743 RequestConverter.buildOpenRegionRequest(server, region, favoredNodes, false); +744 try { +745 OpenRegionResponse response = admin.openRegion(null, request); +746 return ResponseConverter.getRegionOpeningState(response); +747 } catch (ServiceException se) { +748 checkForRSznode(server, se); +749 throw ProtobufUtil.getRemoteException(se); +750 } +751 } +752 +753 /** +754 * Check for an odd state, where we think an RS is up but it is not. Do it on OPEN. +755 * This is only case where the check makes sense. +756 * +757 * <p>We are checking for instance of HBASE-9593 where a RS registered but died before it put +758 * up its znode in zk. In this case, the RS made it into the list of online servers but it +759 * is not actually UP. We do the check here where there is an evident problem rather +760 * than do some crazy footwork where we'd have master check zk after a RS had reported +761 * for duty with provisional state followed by a confirmed state; that'd be a mess. +762 * Real fix is HBASE-17733. +763 */ +764 private void checkForRSznode(final ServerName serverName, final ServiceException se) { +765 if (se.getCause() == null) return; +766 Throwable t = se.getCause(); +767 if (t instanceof ConnectException) { +768 // If this, proceed to do cleanup. +769 } else { +770 // Look for FailedServerException +771 if (!(t instanceof IOException)) return; +772 if (t.getCause() == null) return; +773 if (!(t.getCause() instanceof FailedServerException)) return; +774 // Ok, found FailedServerException -- continue. +775 } +776 if (!isServerOnline(serverName)) return; +777 // We think this server is online. Check it has a znode up. Currently, a RS +778 // registers an ephereral znode in zk. If not present, something is up. Maybe +779 // HBASE-9593 where RS crashed AFTER reportForDuty but BEFORE it put up an ephemeral +780 // znode. +781 List<String> servers = null; +782 try { +783 servers = getRegionServersInZK(this.master.getZooKeeper()); +784 } catch (KeeperException ke) { +785 LOG.warn("Failed to list regionservers", ke); +786 // ZK is malfunctioning, don't hang here +787 } +788 boolean found = false; +789 if (servers != null) { +790 for (String serverNameAsStr: servers) { +791 ServerName sn = ServerName.valueOf(serverNameAsStr); +792 if (sn.equals(serverName)) { +793 // Found a server up in zk. 
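checkForRSznode() above guards against HBASE-9593: a region server that registered with the master but died before creating its ephemeral znode, so it sits in the online-servers list while not actually being up. The check lists the region-server znodes and expires the server if its znode is missing. A standalone sketch of the same idea using the plain ZooKeeper client; the /hbase/rs path is the default layout and may differ if zookeeper.znode.parent is customized:

    import java.util.List;
    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.ZooKeeper;

    // Sketch: does this server have a live ephemeral znode under /hbase/rs?
    // serverName is expected in the <hostname>,<port>,<startcode> form used by HBase.
    final class RsZnodeCheck {
      static boolean hasRsZnode(ZooKeeper zk, String serverName)
          throws KeeperException, InterruptedException {
        List<String> children = zk.getChildren("/hbase/rs", false); // no watch
        return children.contains(serverName);
      }
    }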
+794 found = true; +795 break; +796 } +797 } +798 } +799 if (!found) { +800 LOG.warn("Online server " + serverName.toString() + " has no corresponding " + +801 "ephemeral znode (Did it die before registering in zk?); " + +802 "calling expire to clean it up!"); +803 expireServer(serverName); +804 } +805 } +806 +807 /** +808 * Sends an OPEN RPC to the specified server to open the specified region. +809 * <p> +810 * Open should not fail but can if server just crashed. +811 * <p> +812 * @param server server to open a region +813 * @param regionOpenInfos info of a list of regions to open +814 * @return a list of region opening states +815 */ +816 public List<RegionOpeningState> sendRegionOpen(ServerName server, +817 List<Pair<HRegionInfo, List<ServerName>>> regionOpenInfos) +818 throws IOException { +819 AdminService.BlockingInterface admin = getRsAdmin(server); +820 if (admin == null) { +821 throw new IOException("Attempting to send OPEN RPC to server " + server.toString() + +822 " failed because no RPC connection found to this server"); +823 } +824 +825 OpenRegionRequest request = +826 RequestConverter.buildOpenRegionRequest(server, regionOpenInfos, false); +827 try { +828 OpenRegionResponse response = admin.openRegion(null, request); +829 return ResponseConverter.getRegionOpeningStateList(response); +830 } catch (ServiceException se) { +831 checkForRSznode(server, se); +832 throw ProtobufUtil.getRemoteException(se); +833 } +834 } +835 +836 private HBaseRpcController newRpcController() { +837 return rpcControllerFactory == null ? null : rpcControllerFactory.newController(); +838 } +839 +840 /** +841 * Sends an CLOSE RPC to the specified server to close the specified region. 842 * <p> -843 * @param server server to open a region -844 * @param regionOpenInfos info of a list of regions to open -845 * @return a list of region opening states -846 */ -847 public List<RegionOpeningState> sendRegionOpen(ServerName server, -848 List<Pair<HRegionInfo, List<ServerName>>> regionOpenInfos) -849 throws IOException { -850 AdminService.BlockingInterface admin = getRsAdmin(server); -851 if (admin == null) { -852 throw new IOException("Attempting to send OPEN RPC to server " + server.toString() + -853 " failed because no RPC connection found to this server"); -854 } -855 -856 OpenRegionRequest request = -857 RequestConverter.buildOpenRegionRequest(server, regionOpenInfos, false); -858 try { -859 OpenRegionResponse response = admin.openRegion(null, request); -860 return ResponseConverter.getRegionOpeningStateList(response); -861 } catch (ServiceException se) { -862 checkForRSznode(server, se); -863 throw ProtobufUtil.getRemoteException(se); -864 } -865 } -866 -867 private HBaseRpcController newRpcController() { -868 return rpcControllerFactory == null ? null : rpcControllerFactory.newController(); -869 } -870 -871 /** -872 * Sends an CLOSE RPC to the specified server to close the specified region. -873 * <p> -874 * A region server could reject the close request because it either does not -875 * have the specified region or the region is being split. -876 * @param server server to open a region -877 * @param region region to open -878 * @param dest - if the region is moved to another server, the destination server. null otherwise. 
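The second sendRegionOpen() overload above opens a batch of regions with a single OPEN RPC and returns a list of region opening states, one per requested region (shown here assuming request order). A hedged caller sketch with hypothetical names; the element type of the returned list is the RegionOpeningState documented above, iterated as Object to keep the sketch short:

    import java.io.IOException;
    import java.util.List;
    import org.apache.hadoop.hbase.HRegionInfo;
    import org.apache.hadoop.hbase.ServerName;
    import org.apache.hadoop.hbase.master.ServerManager;
    import org.apache.hadoop.hbase.util.Pair;

    // Sketch: submit a batch of regions to open and print each region's result.
    final class BulkOpenExample {
      static void openAll(ServerManager sm, ServerName server,
          List<Pair<HRegionInfo, List<ServerName>>> toOpen) throws IOException {
        List<?> states = sm.sendRegionOpen(server, toOpen);
        for (int i = 0; i < states.size(); i++) {
          System.out.println(toOpen.get(i).getFirst().getRegionNameAsString()
              + " -> " + states.get(i));
        }
      }
    }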
-879 * @throws IOException -880 */ -881 public boolean sendRegionClose(ServerName server, HRegionInfo region, -882 ServerName dest) throws IOException { -883 if (server == null) throw new NullPointerException("Passed server is null"); -884 AdminService.BlockingInterface admin = getRsAdmin(server); -885 if (admin == null) { -886 throw new IOException("Attempting to send CLOSE RPC to server " + -887 server.toString() + " for region " + -888 region.getRegionNameAsString() + -889 " failed because no RPC connection found to this server"); -890 } -891 HBaseRpcController controller = newRpcController(); -892 return ProtobufUtil.closeRegion(controller, admin, server, region.getRegionName(), dest); -893 } -894 -895 public boolean sendRegionClose(ServerName server, -896 HRegionInfo region) throws IOException { -897 return sendRegionClose(server, region, null); -898 } -899 -900 /** -901 * Sends a WARMUP RPC to the specified server to warmup the specified region. -902 * <p> -903 * A region server could reject the close request because it either does not -904 * have the specified region or the region is being split. -905 * @param server server to warmup a region -906 * @param region region to warmup -907 */ -908 public void sendRegionWarmup(ServerName server, -909 HRegionInfo region) { -910 if (server == null) return; -911 try { -912 AdminService.BlockingInterface admin = getRsAdmin(server); -913 HBaseRpcController controller = newRpcController(); -914 ProtobufUtil.warmupRegion(controller, admin, region); -915 } catch (IOException e) { -916 LOG.error("Received exception in RPC for warmup server:" + -917 server + "region: " + region + -918 "exception: " + e); -919 } -920 } -921 -922 /** -923 * Contacts a region server and waits up to timeout ms -924 * to close the region. This bypasses the active hmaster. +843 * A region server could reject the close request because it either does not +844 * have the specified region or the region is being split. +845 * @param server server to open a region +846 * @param region region to open +847 * @param dest - if the region is moved to another server, the destination server. null otherwise. +848 * @throws IOException +849 */ +850 public boolean sendRegionClose(ServerName server, HRegionInfo region, +851 ServerName dest) throws IOException { +852 if (server == null) throw new NullPointerException("Passed server is null"); +853 AdminService.BlockingInterface admin = getRsAdmin(server); +854 if (admin == null) { +855 throw new IOException("Attempting to send CLOSE RPC to server " + +856 server.toString() + " for region " + +857 region.getRegionNameAsString() + +858 " failed because no RPC connection found to this server"); +859 } +860 HBaseRpcController controller = newRpcController(); +861 return ProtobufUtil.closeRegion(controller, admin, server, region.getRegionName(), dest); +862 } +863 +864 public boolean sendRegionClose(ServerName server, +865 HRegionInfo region) throws IOException { +866 return sendRegionClose(server, region, null); +867 } +868 +869 /** +870 * Sends a WARMUP RPC to the specified server to warmup the specified region. +871 * <p> +872 * A region server could reject the close request because it either does not +873 * have the specified region or the region is being split. 
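sendRegionWarmup() and the sendRegionClose() overload that takes a destination server can be combined when a region is moved: the destination warms the region before the source is asked to close it, and passing the destination marks the close as a move. A simplified sketch of that sequencing (hypothetical helper, not the master's actual move path):

    import java.io.IOException;
    import org.apache.hadoop.hbase.HRegionInfo;
    import org.apache.hadoop.hbase.ServerName;
    import org.apache.hadoop.hbase.master.ServerManager;

    // Sketch: warm the region on the destination, then close it on the source,
    // passing the destination so the close is treated as part of a move.
    final class MoveSketch {
      static boolean move(ServerManager sm, HRegionInfo region,
          ServerName source, ServerName destination) throws IOException {
        sm.sendRegionWarmup(destination, region);   // best effort; errors are only logged
        return sm.sendRegionClose(source, region, destination);
      }
    }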
+874 * @param server server to warmup a region +875 * @param region region to warmup +876 */ +877 public void sendRegionWarmup(ServerName server, +878 HRegionInfo region) { +879 if (server == null) return; +880 try { +881 AdminService.BlockingInterface admin = getRsAdmin(server); +882 HBaseRpcController controller = newRpcController(); +883 ProtobufUtil.warmupRegion(controller, admin, region); +884 } catch (IOException e) { +885 LOG.error("Received exception in RPC for warmup server:" + +886 server + "region: " + region + +887 "exception: " + e); +888 } +889 } +890 +891 /** +892 * Contacts a region server and waits up to timeout ms +893 * to close the region. This bypasses the active hmaster. +894 */ +895 public static void closeRegionSilentlyAndWait(ClusterConnection connection, +896 ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException { +897 AdminService.BlockingInterface rs = connection.getAdmin(server); +898 HBaseRpcController controller = connection.getRpcControllerFactory().newController(); +899 try { +900 ProtobufUtil.closeRegion(controller, rs, server, region.getRegionName()); +901 } catch (IOException e) { +902 LOG.warn("Exception when closing region: " + region.getRegionNameAsString(), e); +903 } +904 long expiration = timeout + System.currentTimeMillis(); +905 while (System.currentTimeMillis() < expiration) { +906 controller.reset(); +907 try { +908 HRegionInfo rsRegion = +909 ProtobufUtil.getRegionInfo(controller, rs, region.getRegionName()); +910 if (rsRegion == null) return; +911 } catch (IOException ioe) { +912 if (ioe instanceof NotServingRegionException) // no need to retry again +913 return; +914 LOG.warn("Exception when retrieving regioninfo from: " +915 + region.getRegionNameAsString(), ioe); +916 } +917 Thread.sleep(1000); +918 } +919 throw new IOException("Region " + region + " failed to close within" +920 + " timeout " + timeout); +921 } +922 +923 /** +924 * Check if a region server is reachable and has the expected start code 925 */ -926 public static void closeRegionSilentlyAndWait(ClusterConnection connection, -927 ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException { -928 AdminService.BlockingInterface rs = connection.getAdmin(server); -929 HBaseRpcController controller = connection.getRpcControllerFactory().newController(); -930 try { -931 ProtobufUtil.closeRegion(controller, rs, server, region.getRegionName()); -932 } catch (IOException e) { -933 LOG.warn("Exception when closing region: " + region.getRegionNameAsString(), e); -934 } -935 long expiration = timeout + System.currentTimeMillis(); -936 while (System.currentTimeMillis() < expiration) { -937 controller.reset(); -938 try { -939 HRegionInfo rsRegion = -940 ProtobufUtil.getRegionInfo(controller, rs, region.getRegionName()); -941 if (rsRegion == null) return; -942 } catch (IOException ioe) { -943 if (ioe instanceof NotServingRegionException) // no need to retry again -944 return; -945 LOG.warn("Exception when retrieving regioninfo from: " -946 + region.getRegionNameAsString(), ioe); -947 } -948 Thread.sleep(1000); +926 public boolean isServerReachable(ServerName server) { +927 if (server == null) throw new NullPointerException("Passed server is null"); +928 +929 +930 RetryCounter retryCounter = pingRetryCounterFactory.create(); +931 while (retryCounter.shouldRetry()) { +932 try { +933 HBaseRpcController controller = newRpcController(); +934 AdminService.BlockingInterface admin = getRsAdmin(server); +935 if (admin != null) { 
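closeRegionSilentlyAndWait() above is a static utility that talks to the region server directly, bypassing the active master, and then polls getRegionInfo until the region is gone or the deadline passes. A hedged usage sketch; the connection, server and region values are assumed to exist:

    import java.io.IOException;
    import org.apache.hadoop.hbase.HRegionInfo;
    import org.apache.hadoop.hbase.ServerName;
    import org.apache.hadoop.hbase.client.ClusterConnection;
    import org.apache.hadoop.hbase.master.ServerManager;

    // Sketch: close a region directly on its hosting server and wait at most 30 seconds.
    final class SilentCloseExample {
      static void closeNow(ClusterConnection connection, ServerName server,
          HRegionInfo region) throws IOException, InterruptedException {
        ServerManager.closeRegionSilentlyAndWait(connection, server, region, 30_000L);
      }
    }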
+936 ServerInfo info = ProtobufUtil.getServerInfo(controller, admin); +937 return info != null && info.hasServerName() +938 && server.getStartcode() == info.getServerName().getStartCode(); +939 } +940 } catch (IOException ioe) { +941 LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() +942 + " of " + retryCounter.getMaxAttempts(), ioe); +943 try { +944 retryCounter.sleepUntilNextRetry(); +945 } catch(InterruptedException ie) { +946 Thread.currentThread().interrupt(); +947 } +948 } 949 } -950 throw new IOException("Region " + region + " failed to close within" -951 + " timeout " + timeout); -952 } -953 -954 /** -955 * Check if a region server is reachable and has the expected start code -956 */ -957 public boolean isServerReachable(ServerName server) { -958 if (server == null) throw new NullPointerException("Passed server is null"); -959 -960 -961 RetryCounter retryCounter = pingRetryCounterFactory.create(); -962 while (retryCounter.shouldRetry()) { -963 try { -964 HBaseRpcController controller = newRpcController(); -965 AdminService.BlockingInterface admin = getRsAdmin(server); -966 if (admin != null) { -967 ServerInfo info = ProtobufUtil.getServerInfo(controller, admin); -968 return info != null && info.hasServerName() -969 && server.getStartcode() == info.getServerName().getStartCode(); -970 } -971 } catch (IOException ioe) { -972 LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() -973 + " of " + retryCounter.getMaxAttempts(), ioe); -974 try { -975 retryCounter.sleepUntilNextRetry(); -976 } catch(InterruptedException ie) { -977 Thread.currentThread().interrupt(); -978 } -979 } -980 } -981 return false; -982 } -983 -984 /** -985 * @param sn -986 * @return Admin interface for the remote regionserver named <code>sn</code> -987 * @throws IOException -988 * @throws RetriesExhaustedException wrapping a ConnectException if failed -989 */ -990 public AdminService.BlockingInterface getRsAdmin(final ServerName sn) -991 throws IOException { -992 AdminService.BlockingInterface admin = this.rsAdmins.get(sn); -993 if (admin == null) { -994 LOG.debug("New admin connection to " + sn.toString()); -995 if (sn.equals(master.getServerName()) && master instanceof HRegionServer) { -996 // A master is also a region server now, see HBASE-10569 for details -997 admin = ((HRegionServer)master).getRSRpcServices(); -998 } else { -999 admin = this.connection.getAdmin(sn); -1000 } -1001 this.rsAdmins.put(sn, admin); -1002 } -1003 return admin; -1004 } -1005 -1006 /** -1007 * Calculate min necessary to start. This is not an absolute. It is just -1008 * a friction that will cause us hang around a bit longer waiting on -1009 * RegionServers to check-in. -1010 */ -1011 private int getMinToStart() { -1012 // One server should be enough to get us off the ground. -1013 int requiredMinToStart = 1; -1014 if (BaseLoadBalancer.tablesOnMaster(master.getConfiguration())) { -1015 if (!BaseLoadBalancer.userTablesOnMaster(master.getConfiguration())) { -1016 // If Master is carrying regions but NOT user-space regions (the current default), -1017 // since the Master shows as a 'server', we need at least one more server to check -1018 // in before we can start up so up defaultMinToStart to 2. -1019 requiredMinToStart = 2; -1020 } -1021 } -1022 int minToStart = this.master.getConfiguration().getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, -1); -1023 // Ensure we are never less than requiredMinToStart else stuff won't work. 
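isServerReachable() above does more than ping: it compares the start code in the reply against the ServerName it was asked about, so a region server that died and came back on the same host and port, with a new start code, is effectively unreachable under its old identity. A small, runnable illustration of that identity rule using ServerName only:

    import org.apache.hadoop.hbase.ServerName;

    // Two ServerName values for the same host:port but different start codes are
    // different servers as far as the check above is concerned.
    final class StartCodeExample {
      public static void main(String[] args) {
        ServerName before = ServerName.valueOf("rs1.example.com", 16020, 1500000000000L);
        ServerName after  = ServerName.valueOf("rs1.example.com", 16020, 1500000100000L);
        System.out.println(before.equals(after));                          // false
        System.out.println(before.getStartcode() == after.getStartcode()); // false
      }
    }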
-1024 return minToStart == -1 || minToStart < requiredMinToStart? requiredMinToStart: minToStart; -1025 } -1026 -1027 /** -1028 * Wait for the region servers to report in. -1029 * We will wait until one of this condition is met: -1030 * - the master is stopped -1031 * - the 'hbase.master.wait.on.regionservers.maxtostart' number of -1032 * region servers is reached -1033 * - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND -1034 * there have been no new region server in for -1035 * 'hbase.master.wait.on.regionservers.interval' time AND -1036 * the 'hbase.master.wait.on.regionservers.timeout' is reached -1037 * -1038 * @throws InterruptedException -1039 */ -1040 public void waitForRegionServers(MonitoredTask status) throws InterruptedException { -1041 final long interval = this.master.getConfiguration(). -1042 getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500); -1043 final long timeout = this.master.getConfiguration(). -1044 getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500); -1045 // Min is not an absolute; just a friction making us wait longer on server checkin. -1046 int minToStart = getMinToStart(); -1047 int maxToStart = this.master.getConfiguration(). -1048 getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE); -1049 if (maxToStart < minToStart) { -1050 LOG.warn(String.format("The value of '%s' (%d) is set less than '%s' (%d), ignoring.", -1051 WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart, -1052 WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart)); -1053 maxToStart = Integer.MAX_VALUE; -1054 } -1055 -1056 long now = System.currentTimeMillis(); -1057 final long startTime = now; -1058 long slept = 0; -1059 long lastLogTime = 0; -1060 long lastCountChange = startTime; -1061 int count = countOfRegionServers(); -1062 int oldCount = 0; -1063 // This while test is a little hard to read. We try to comment it in below but in essence: -1064 // Wait if Master is not stopped and the number of regionservers that have checked-in is -1065 // less than the maxToStart. Both of these conditions will be true near universally. -1066 // Next, we will keep cycling if ANY of the following three conditions are true: -1067 // 1. The time since a regionserver registered is < interval (means servers are actively checking in). -1068 // 2. We are under the total timeout. -1069 // 3. The count of servers is < minimum. 
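The wait-for-regionservers logic above is driven by the four settings named in its javadoc: hbase.master.wait.on.regionservers.mintostart, .maxtostart, .interval and .timeout, read with defaults of -1 (compute the minimum), Integer.MAX_VALUE, 1500 ms and 4500 ms respectively. A short example of overriding them programmatically; the key strings come from the javadoc above, the values are arbitrary:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;

    // Sketch: raise the minimum to 3 region servers and lengthen the quiet-period
    // and timeout knobs consulted by waitForRegionServers().
    final class WaitOnRegionServersConfig {
      static Configuration tuned() {
        Configuration conf = HBaseConfiguration.create();
        conf.setInt("hbase.master.wait.on.regionservers.mintostart", 3);
        conf.setInt("hbase.master.wait.on.regionservers.maxtostart", 10);
        conf.setLong("hbase.master.wait.on.regionservers.interval", 2000L);
        conf.setLong("hbase.master.wait.on.regionservers.timeout", 60000L);
        return conf;
      }
    }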
-1070 for (ServerListener listener: this.listeners) { -1071 listener.waiting(); -1072 } -1073 while (!this.master.isStopped() && count < maxToStart && -1074 ((lastCountChange + interval) > now || timeout > slept || count < minToStart)) { -1075 // Log some info at every interval time or if there is a change -1076 if (oldCount != count || lastLogTime + interval < now) { -1077 lastLogTime = now; -1078 String msg = -1079 "Waiting on RegionServer count=" + count + " to settle; waited="+ -1080 slept + "ms, expecting min=" + minToStart + " server(s), max="+ getStrForMax(maxToStart) + -1081 " server(s), " + "timeout=" + timeout + "ms, lastChange=" + (lastCountChange - now) + "ms"; -1082 LOG.info(msg); -1083 status.setStatus(msg); -1084 } -1085 -1086 // We sleep for some time -1087 final long sleepTime = 50; -1088 Thread.sleep(sleepTime); -1089 now = System.currentTimeMillis(); -1090 slept = now - startTime; -1091 -1092 oldCount = count; -1093 count = countOfRegionServers(); -1094 if (count != oldCount) { -1095 lastCountChange = now; -1096 } -1097 } -1098 LOG.info("Finished wait on RegionServer count=" + count + "; waited=" + slept + "ms," + -1099 " expected min=" + minToStart + " server(s), max=" + getStrForMax(maxToStart) + " server(s),"+ -1100 " master is "+ (this.master.isStopped() ? "stopped.": "running")); -1101 } -1102 -1103 private String getStrForMax(final int max) { -1104 return max == Integer.MAX_VALUE? "NO_LIMIT": Integer.toString(max); -1105 } -1106 -1107 /** -1108 * @return A copy of the internal list of online servers. -1109 */ -1110 public List<ServerName> getOnlineServersList() { -1111 // TODO: optimize the load balancer call so we don't need to make a new list -1112 // TODO: FIX. THIS IS POPULAR CALL. -1113 return new ArrayList<>(this.onlineServers.keySet()); -1114 } -1115 -1116 /** -1117 * @param keys The target server name -1118 * @param idleServerPredicator Evaluates the server on the given load -1119 * @return A copy of the internal list of online servers matched by the predicator -1120 */ -1121 public List<ServerName> getOnlineServersListWithPredicator(List<ServerName> keys, -1122 Predicate<ServerLoad> idleServerPredicator) { -1123 List<ServerName> names = new ArrayList<>(); -1124 if (keys != null && idleServerPredicator != null) { -1125 keys.forEach(name -> { -1126 ServerLoad load = onlineServers.get(name); -1127 if (load != null) { -1128 if (idleServerPredicator.test(load)) { -1129 names.add(name); -1130 } -1131 } -1132 }); -1133 } -1134 return names; +950 return false; +951 } +952 +953 /** +954 * @param sn +955 * @return Admin interface for the remote regionserver named <code>sn</code> +956 * @throws IOException +957 * @throws RetriesExhaustedException wrapping a ConnectException if failed +958 */ +959 public AdminService.BlockingInterface getRsAdmin(final ServerName sn) +960 throws IOException { +961 AdminService.BlockingInterface admin = this.rsAdmins.get(sn); +962 if (admin == null) { +963 LOG.debug("New admin connection to " + sn.toString()); +964 if (sn.equals(master.getServerName()) && master instanceof HRegionServer) { +965 // A master is also a region server now, see HBASE-10569 for details +966 admin = ((HRegionServer)master).getRSRpcServices(); +967 } else { +968 admin = this.connection.getAdmin(sn); +969 } +970 this.rsAdmins.put(sn, admin); +971 } +972 return admin; +973 } +974 +975 /** +976 * Calculate min necessary to start. This is not an absolute. 
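The while-condition in waitForRegionServers() above is easiest to read as a separate predicate: keep waiting while the master is running, fewer than maxToStart servers have checked in, and at least one of three things holds: a server checked in within the last interval, the overall timeout has not elapsed, or the count is still below minToStart. A plain-Java restatement of that guard (sketch only, mirroring the quoted loop):

    // Mirrors the loop guard in waitForRegionServers(); time values are in milliseconds.
    final class WaitPredicate {
      static boolean keepWaiting(boolean masterStopped, int count, int minToStart,
          int maxToStart, long now, long lastCountChange, long interval,
          long slept, long timeout) {
        boolean recentCheckIn = (lastCountChange + interval) > now;
        boolean underTimeout  = timeout > slept;
        boolean belowMinimum  = count < minToStart;
        return !masterStopped && count < maxToStart
            && (recentCheckIn || underTimeout || belowMinimum);
      }
    }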
It is just
+977 * a friction that will cause us to hang around a bit longer waiting on
+978 * RegionServers to check-in.
+979 */
+980 private int getMinToStart() {
+981 // One server should be enough to get us off the ground.
+982 int requiredMinToStart = 1;
+983 if (BaseLoadBalancer.tablesOnMaster(master.getConfiguration())) {
+984 if (!BaseLoadBalancer.userTablesOnMaster(master.getConfiguration())) {
+985 // If Master is carrying regions but NOT user-space regions (the current default),
+986 // since the Master
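getMinToStart(), shown in its old form earlier in this diff and being re-added above, computes a required floor of 1, or 2 when the master itself carries system tables only and therefore counts as one of the checked-in servers, and then takes the larger of that floor and the configured hbase.master.wait.on.regionservers.mintostart (where -1 means "not configured"). The arithmetic, restated as a sketch:

    // Mirrors getMinToStart(): never return less than the required floor.
    final class MinToStart {
      static int effectiveMinToStart(int configured, boolean masterCarriesSystemTablesOnly) {
        int floor = masterCarriesSystemTablesOnly ? 2 : 1;
        return (configured == -1 || configured < floor) ? floor : configured;
      }
    }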