From commits-return-65149-archive-asf-public=cust-asf.ponee.io@hbase.apache.org Thu Jan 11 16:31:04 2018 Return-Path: X-Original-To: archive-asf-public@eu.ponee.io Delivered-To: archive-asf-public@eu.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by mx-eu-01.ponee.io (Postfix) with ESMTP id 56FF8180787 for ; Thu, 11 Jan 2018 16:31:04 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 43FED160C4F; Thu, 11 Jan 2018 15:31:04 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 8A21E160C46 for ; Thu, 11 Jan 2018 16:30:59 +0100 (CET) Received: (qmail 3238 invoked by uid 500); 11 Jan 2018 15:30:57 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 1911 invoked by uid 99); 11 Jan 2018 15:30:56 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 11 Jan 2018 15:30:56 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id E5C54F3522; Thu, 11 Jan 2018 15:30:51 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: git-site-role@apache.org To: commits@hbase.apache.org Date: Thu, 11 Jan 2018 15:31:24 -0000 Message-Id: In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [35/51] [partial] hbase-site git commit: Published site at . http://git-wip-us.apache.org/repos/asf/hbase-site/blob/f183e80f/devapidocs/src-html/org/apache/hadoop/hbase/regionserver/HRegionServer.html ---------------------------------------------------------------------- diff --git a/devapidocs/src-html/org/apache/hadoop/hbase/regionserver/HRegionServer.html b/devapidocs/src-html/org/apache/hadoop/hbase/regionserver/HRegionServer.html index e743560..163ade0 100644 --- a/devapidocs/src-html/org/apache/hadoop/hbase/regionserver/HRegionServer.html +++ b/devapidocs/src-html/org/apache/hadoop/hbase/regionserver/HRegionServer.html @@ -2124,1654 +2124,1642 @@ 2116 return healthy; 2117 } 2118 -2119 private static final byte[] UNSPECIFIED_REGION = new byte[]{}; -2120 -2121 @Override -2122 public List<WAL> getWALs() throws IOException { -2123 return walFactory.getWALs(); -2124 } -2125 -2126 @Override -2127 public WAL getWAL(RegionInfo regionInfo) throws IOException { -2128 WAL wal; -2129 // _ROOT_ and hbase:meta regions have separate WAL. -2130 if (regionInfo != null && regionInfo.isMetaRegion() -2131 && regionInfo.getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) { -2132 wal = walFactory.getMetaWAL(regionInfo.getEncodedNameAsBytes()); -2133 } else if (regionInfo == null) { -2134 wal = walFactory.getWAL(UNSPECIFIED_REGION, null); -2135 } else { -2136 byte[] namespace = regionInfo.getTable().getNamespace(); -2137 wal = walFactory.getWAL(regionInfo.getEncodedNameAsBytes(), namespace); -2138 } -2139 if (this.walRoller != null) { -2140 this.walRoller.addWAL(wal); -2141 } -2142 return wal; -2143 } -2144 -2145 public LogRoller getWalRoller() { -2146 return walRoller; -2147 } -2148 -2149 @Override -2150 public Connection getConnection() { -2151 return getClusterConnection(); -2152 } -2153 -2154 @Override -2155 public ClusterConnection getClusterConnection() { -2156 return this.clusterConnection; -2157 } -2158 -2159 @Override -2160 public MetaTableLocator getMetaTableLocator() { -2161 return this.metaTableLocator; -2162 } -2163 -2164 @Override -2165 public void stop(final String msg) { -2166 stop(msg, false, RpcServer.getRequestUser().orElse(null)); -2167 } -2168 -2169 /** -2170 * Stops the regionserver. -2171 * @param msg Status message -2172 * @param force True if this is a regionserver abort -2173 * @param user The user executing the stop request, or null if no user is associated -2174 */ -2175 public void stop(final String msg, final boolean force, final User user) { -2176 if (!this.stopped) { -2177 LOG.info("***** STOPPING region server '" + this + "' *****"); -2178 if (this.rsHost != null) { -2179 // when forced via abort don't allow CPs to override -2180 try { -2181 this.rsHost.preStop(msg, user); -2182 } catch (IOException ioe) { -2183 if (!force) { -2184 LOG.warn("The region server did not stop", ioe); -2185 return; -2186 } -2187 LOG.warn("Skipping coprocessor exception on preStop() due to forced shutdown", ioe); -2188 } -2189 } -2190 this.stopped = true; -2191 LOG.info("STOPPED: " + msg); -2192 // Wakes run() if it is sleeping -2193 sleeper.skipSleepCycle(); -2194 } -2195 } -2196 -2197 public void waitForServerOnline(){ -2198 while (!isStopped() && !isOnline()) { -2199 synchronized (online) { -2200 try { -2201 online.wait(msgInterval); -2202 } catch (InterruptedException ie) { -2203 Thread.currentThread().interrupt(); -2204 break; -2205 } -2206 } -2207 } -2208 } -2209 -2210 @Override -2211 public void postOpenDeployTasks(final PostOpenDeployContext context) -2212 throws KeeperException, IOException { -2213 HRegion r = context.getRegion(); -2214 long masterSystemTime = context.getMasterSystemTime(); -2215 rpcServices.checkOpen(); -2216 LOG.info("Post open deploy tasks for " + r.getRegionInfo().getRegionNameAsString()); -2217 // Do checks to see if we need to compact (references or too many files) -2218 for (HStore s : r.stores.values()) { -2219 if (s.hasReferences() || s.needsCompaction()) { -2220 this.compactSplitThread.requestSystemCompaction(r, s, "Opening Region"); -2221 } -2222 } -2223 long openSeqNum = r.getOpenSeqNum(); -2224 if (openSeqNum == HConstants.NO_SEQNUM) { -2225 // If we opened a region, we should have read some sequence number from it. -2226 LOG.error("No sequence number found when opening " + -2227 r.getRegionInfo().getRegionNameAsString()); -2228 openSeqNum = 0; -2229 } +2119 @Override +2120 public List<WAL> getWALs() throws IOException { +2121 return walFactory.getWALs(); +2122 } +2123 +2124 @Override +2125 public WAL getWAL(RegionInfo regionInfo) throws IOException { +2126 WAL wal = walFactory.getWAL(regionInfo); +2127 if (this.walRoller != null) { +2128 this.walRoller.addWAL(wal); +2129 } +2130 return wal; +2131 } +2132 +2133 public LogRoller getWalRoller() { +2134 return walRoller; +2135 } +2136 +2137 @Override +2138 public Connection getConnection() { +2139 return getClusterConnection(); +2140 } +2141 +2142 @Override +2143 public ClusterConnection getClusterConnection() { +2144 return this.clusterConnection; +2145 } +2146 +2147 @Override +2148 public MetaTableLocator getMetaTableLocator() { +2149 return this.metaTableLocator; +2150 } +2151 +2152 @Override +2153 public void stop(final String msg) { +2154 stop(msg, false, RpcServer.getRequestUser().orElse(null)); +2155 } +2156 +2157 /** +2158 * Stops the regionserver. +2159 * @param msg Status message +2160 * @param force True if this is a regionserver abort +2161 * @param user The user executing the stop request, or null if no user is associated +2162 */ +2163 public void stop(final String msg, final boolean force, final User user) { +2164 if (!this.stopped) { +2165 LOG.info("***** STOPPING region server '" + this + "' *****"); +2166 if (this.rsHost != null) { +2167 // when forced via abort don't allow CPs to override +2168 try { +2169 this.rsHost.preStop(msg, user); +2170 } catch (IOException ioe) { +2171 if (!force) { +2172 LOG.warn("The region server did not stop", ioe); +2173 return; +2174 } +2175 LOG.warn("Skipping coprocessor exception on preStop() due to forced shutdown", ioe); +2176 } +2177 } +2178 this.stopped = true; +2179 LOG.info("STOPPED: " + msg); +2180 // Wakes run() if it is sleeping +2181 sleeper.skipSleepCycle(); +2182 } +2183 } +2184 +2185 public void waitForServerOnline(){ +2186 while (!isStopped() && !isOnline()) { +2187 synchronized (online) { +2188 try { +2189 online.wait(msgInterval); +2190 } catch (InterruptedException ie) { +2191 Thread.currentThread().interrupt(); +2192 break; +2193 } +2194 } +2195 } +2196 } +2197 +2198 @Override +2199 public void postOpenDeployTasks(final PostOpenDeployContext context) +2200 throws KeeperException, IOException { +2201 HRegion r = context.getRegion(); +2202 long masterSystemTime = context.getMasterSystemTime(); +2203 rpcServices.checkOpen(); +2204 LOG.info("Post open deploy tasks for " + r.getRegionInfo().getRegionNameAsString()); +2205 // Do checks to see if we need to compact (references or too many files) +2206 for (HStore s : r.stores.values()) { +2207 if (s.hasReferences() || s.needsCompaction()) { +2208 this.compactSplitThread.requestSystemCompaction(r, s, "Opening Region"); +2209 } +2210 } +2211 long openSeqNum = r.getOpenSeqNum(); +2212 if (openSeqNum == HConstants.NO_SEQNUM) { +2213 // If we opened a region, we should have read some sequence number from it. +2214 LOG.error("No sequence number found when opening " + +2215 r.getRegionInfo().getRegionNameAsString()); +2216 openSeqNum = 0; +2217 } +2218 +2219 // Notify master +2220 if (!reportRegionStateTransition(new RegionStateTransitionContext( +2221 TransitionCode.OPENED, openSeqNum, masterSystemTime, r.getRegionInfo()))) { +2222 throw new IOException("Failed to report opened region to master: " +2223 + r.getRegionInfo().getRegionNameAsString()); +2224 } +2225 +2226 triggerFlushInPrimaryRegion(r); +2227 +2228 LOG.debug("Finished post open deploy task for " + r.getRegionInfo().getRegionNameAsString()); +2229 } 2230 -2231 // Notify master -2232 if (!reportRegionStateTransition(new RegionStateTransitionContext( -2233 TransitionCode.OPENED, openSeqNum, masterSystemTime, r.getRegionInfo()))) { -2234 throw new IOException("Failed to report opened region to master: " -2235 + r.getRegionInfo().getRegionNameAsString()); -2236 } +2231 @Override +2232 public boolean reportRegionStateTransition(final RegionStateTransitionContext context) { +2233 TransitionCode code = context.getCode(); +2234 long openSeqNum = context.getOpenSeqNum(); +2235 long masterSystemTime = context.getMasterSystemTime(); +2236 RegionInfo[] hris = context.getHris(); 2237 -2238 triggerFlushInPrimaryRegion(r); -2239 -2240 LOG.debug("Finished post open deploy task for " + r.getRegionInfo().getRegionNameAsString()); -2241 } -2242 -2243 @Override -2244 public boolean reportRegionStateTransition(final RegionStateTransitionContext context) { -2245 TransitionCode code = context.getCode(); -2246 long openSeqNum = context.getOpenSeqNum(); -2247 long masterSystemTime = context.getMasterSystemTime(); -2248 RegionInfo[] hris = context.getHris(); -2249 -2250 if (TEST_SKIP_REPORTING_TRANSITION) { -2251 // This is for testing only in case there is no master -2252 // to handle the region transition report at all. -2253 if (code == TransitionCode.OPENED) { -2254 Preconditions.checkArgument(hris != null && hris.length == 1); -2255 if (hris[0].isMetaRegion()) { -2256 try { -2257 MetaTableLocator.setMetaLocation(getZooKeeper(), serverName, -2258 hris[0].getReplicaId(),State.OPEN); -2259 } catch (KeeperException e) { -2260 LOG.info("Failed to update meta location", e); -2261 return false; -2262 } -2263 } else { -2264 try { -2265 MetaTableAccessor.updateRegionLocation(clusterConnection, -2266 hris[0], serverName, openSeqNum, masterSystemTime); -2267 } catch (IOException e) { -2268 LOG.info("Failed to update meta", e); -2269 return false; -2270 } -2271 } -2272 } -2273 return true; +2238 if (TEST_SKIP_REPORTING_TRANSITION) { +2239 // This is for testing only in case there is no master +2240 // to handle the region transition report at all. +2241 if (code == TransitionCode.OPENED) { +2242 Preconditions.checkArgument(hris != null && hris.length == 1); +2243 if (hris[0].isMetaRegion()) { +2244 try { +2245 MetaTableLocator.setMetaLocation(getZooKeeper(), serverName, +2246 hris[0].getReplicaId(),State.OPEN); +2247 } catch (KeeperException e) { +2248 LOG.info("Failed to update meta location", e); +2249 return false; +2250 } +2251 } else { +2252 try { +2253 MetaTableAccessor.updateRegionLocation(clusterConnection, +2254 hris[0], serverName, openSeqNum, masterSystemTime); +2255 } catch (IOException e) { +2256 LOG.info("Failed to update meta", e); +2257 return false; +2258 } +2259 } +2260 } +2261 return true; +2262 } +2263 +2264 ReportRegionStateTransitionRequest.Builder builder = +2265 ReportRegionStateTransitionRequest.newBuilder(); +2266 builder.setServer(ProtobufUtil.toServerName(serverName)); +2267 RegionStateTransition.Builder transition = builder.addTransitionBuilder(); +2268 transition.setTransitionCode(code); +2269 if (code == TransitionCode.OPENED && openSeqNum >= 0) { +2270 transition.setOpenSeqNum(openSeqNum); +2271 } +2272 for (RegionInfo hri: hris) { +2273 transition.addRegionInfo(ProtobufUtil.toRegionInfo(hri)); 2274 } -2275 -2276 ReportRegionStateTransitionRequest.Builder builder = -2277 ReportRegionStateTransitionRequest.newBuilder(); -2278 builder.setServer(ProtobufUtil.toServerName(serverName)); -2279 RegionStateTransition.Builder transition = builder.addTransitionBuilder(); -2280 transition.setTransitionCode(code); -2281 if (code == TransitionCode.OPENED && openSeqNum >= 0) { -2282 transition.setOpenSeqNum(openSeqNum); -2283 } -2284 for (RegionInfo hri: hris) { -2285 transition.addRegionInfo(ProtobufUtil.toRegionInfo(hri)); -2286 } -2287 ReportRegionStateTransitionRequest request = builder.build(); -2288 int tries = 0; -2289 long pauseTime = INIT_PAUSE_TIME_MS; -2290 // Keep looping till we get an error. We want to send reports even though server is going down. -2291 // Only go down if clusterConnection is null. It is set to null almost as last thing as the -2292 // HRegionServer does down. -2293 while (this.clusterConnection != null && !this.clusterConnection.isClosed()) { -2294 RegionServerStatusService.BlockingInterface rss = rssStub; -2295 try { -2296 if (rss == null) { -2297 createRegionServerStatusStub(); -2298 continue; -2299 } -2300 ReportRegionStateTransitionResponse response = -2301 rss.reportRegionStateTransition(null, request); -2302 if (response.hasErrorMessage()) { -2303 LOG.info("TRANSITION FAILED " + request + ": " + response.getErrorMessage()); -2304 break; -2305 } -2306 // Log if we had to retry else don't log unless TRACE. We want to -2307 // know if were successful after an attempt showed in logs as failed. -2308 if (tries > 0 || LOG.isTraceEnabled()) { -2309 LOG.info("TRANSITION REPORTED " + request); +2275 ReportRegionStateTransitionRequest request = builder.build(); +2276 int tries = 0; +2277 long pauseTime = INIT_PAUSE_TIME_MS; +2278 // Keep looping till we get an error. We want to send reports even though server is going down. +2279 // Only go down if clusterConnection is null. It is set to null almost as last thing as the +2280 // HRegionServer does down. +2281 while (this.clusterConnection != null && !this.clusterConnection.isClosed()) { +2282 RegionServerStatusService.BlockingInterface rss = rssStub; +2283 try { +2284 if (rss == null) { +2285 createRegionServerStatusStub(); +2286 continue; +2287 } +2288 ReportRegionStateTransitionResponse response = +2289 rss.reportRegionStateTransition(null, request); +2290 if (response.hasErrorMessage()) { +2291 LOG.info("TRANSITION FAILED " + request + ": " + response.getErrorMessage()); +2292 break; +2293 } +2294 // Log if we had to retry else don't log unless TRACE. We want to +2295 // know if were successful after an attempt showed in logs as failed. +2296 if (tries > 0 || LOG.isTraceEnabled()) { +2297 LOG.info("TRANSITION REPORTED " + request); +2298 } +2299 // NOTE: Return mid-method!!! +2300 return true; +2301 } catch (ServiceException se) { +2302 IOException ioe = ProtobufUtil.getRemoteException(se); +2303 boolean pause = ioe instanceof ServerNotRunningYetException || +2304 ioe instanceof PleaseHoldException; +2305 if (pause) { +2306 // Do backoff else we flood the Master with requests. +2307 pauseTime = ConnectionUtils.getPauseTime(INIT_PAUSE_TIME_MS, tries); +2308 } else { +2309 pauseTime = INIT_PAUSE_TIME_MS; // Reset. 2310 } -2311 // NOTE: Return mid-method!!! -2312 return true; -2313 } catch (ServiceException se) { -2314 IOException ioe = ProtobufUtil.getRemoteException(se); -2315 boolean pause = ioe instanceof ServerNotRunningYetException || -2316 ioe instanceof PleaseHoldException; -2317 if (pause) { -2318 // Do backoff else we flood the Master with requests. -2319 pauseTime = ConnectionUtils.getPauseTime(INIT_PAUSE_TIME_MS, tries); -2320 } else { -2321 pauseTime = INIT_PAUSE_TIME_MS; // Reset. -2322 } -2323 LOG.info("Failed report transition " + -2324 TextFormat.shortDebugString(request) + "; retry (#" + tries + ")" + -2325 (pause? -2326 " after " + pauseTime + "ms delay (Master is coming online...).": -2327 " immediately."), -2328 ioe); -2329 if (pause) Threads.sleep(pauseTime); -2330 tries++; -2331 if (rssStub == rss) { -2332 rssStub = null; -2333 } -2334 } -2335 } -2336 return false; -2337 } -2338 -2339 /** -2340 * Trigger a flush in the primary region replica if this region is a secondary replica. Does not -2341 * block this thread. See RegionReplicaFlushHandler for details. -2342 */ -2343 void triggerFlushInPrimaryRegion(final HRegion region) { -2344 if (ServerRegionReplicaUtil.isDefaultReplica(region.getRegionInfo())) { -2345 return; -2346 } -2347 if (!ServerRegionReplicaUtil.isRegionReplicaReplicationEnabled(region.conf) || -2348 !ServerRegionReplicaUtil.isRegionReplicaWaitForPrimaryFlushEnabled( -2349 region.conf)) { -2350 region.setReadsEnabled(true); -2351 return; -2352 } -2353 -2354 region.setReadsEnabled(false); // disable reads before marking the region as opened. -2355 // RegionReplicaFlushHandler might reset this. +2311 LOG.info("Failed report transition " + +2312 TextFormat.shortDebugString(request) + "; retry (#" + tries + ")" + +2313 (pause? +2314 " after " + pauseTime + "ms delay (Master is coming online...).": +2315 " immediately."), +2316 ioe); +2317 if (pause) Threads.sleep(pauseTime); +2318 tries++; +2319 if (rssStub == rss) { +2320 rssStub = null; +2321 } +2322 } +2323 } +2324 return false; +2325 } +2326 +2327 /** +2328 * Trigger a flush in the primary region replica if this region is a secondary replica. Does not +2329 * block this thread. See RegionReplicaFlushHandler for details. +2330 */ +2331 void triggerFlushInPrimaryRegion(final HRegion region) { +2332 if (ServerRegionReplicaUtil.isDefaultReplica(region.getRegionInfo())) { +2333 return; +2334 } +2335 if (!ServerRegionReplicaUtil.isRegionReplicaReplicationEnabled(region.conf) || +2336 !ServerRegionReplicaUtil.isRegionReplicaWaitForPrimaryFlushEnabled( +2337 region.conf)) { +2338 region.setReadsEnabled(true); +2339 return; +2340 } +2341 +2342 region.setReadsEnabled(false); // disable reads before marking the region as opened. +2343 // RegionReplicaFlushHandler might reset this. +2344 +2345 // submit it to be handled by one of the handlers so that we do not block OpenRegionHandler +2346 if (this.executorService != null) { +2347 this.executorService.submit(new RegionReplicaFlushHandler(this, clusterConnection, +2348 rpcRetryingCallerFactory, rpcControllerFactory, operationTimeout, region)); +2349 } +2350 } +2351 +2352 @Override +2353 public RpcServerInterface getRpcServer() { +2354 return rpcServices.rpcServer; +2355 } 2356 -2357 // submit it to be handled by one of the handlers so that we do not block OpenRegionHandler -2358 if (this.executorService != null) { -2359 this.executorService.submit(new RegionReplicaFlushHandler(this, clusterConnection, -2360 rpcRetryingCallerFactory, rpcControllerFactory, operationTimeout, region)); -2361 } -2362 } -2363 -2364 @Override -2365 public RpcServerInterface getRpcServer() { -2366 return rpcServices.rpcServer; -2367 } -2368 -2369 @VisibleForTesting -2370 public RSRpcServices getRSRpcServices() { -2371 return rpcServices; -2372 } -2373 -2374 /** -2375 * Cause the server to exit without closing the regions it is serving, the log -2376 * it is using and without notifying the master. Used unit testing and on -2377 * catastrophic events such as HDFS is yanked out from under hbase or we OOME. -2378 * -2379 * @param reason -2380 * the reason we are aborting -2381 * @param cause -2382 * the exception that caused the abort, or null -2383 */ -2384 @Override -2385 public void abort(String reason, Throwable cause) { -2386 String msg = "***** ABORTING region server " + this + ": " + reason + " *****"; -2387 if (cause != null) { -2388 LOG.error(HBaseMarkers.FATAL, msg, cause); -2389 } else { -2390 LOG.error(HBaseMarkers.FATAL, msg); +2357 @VisibleForTesting +2358 public RSRpcServices getRSRpcServices() { +2359 return rpcServices; +2360 } +2361 +2362 /** +2363 * Cause the server to exit without closing the regions it is serving, the log +2364 * it is using and without notifying the master. Used unit testing and on +2365 * catastrophic events such as HDFS is yanked out from under hbase or we OOME. +2366 * +2367 * @param reason +2368 * the reason we are aborting +2369 * @param cause +2370 * the exception that caused the abort, or null +2371 */ +2372 @Override +2373 public void abort(String reason, Throwable cause) { +2374 String msg = "***** ABORTING region server " + this + ": " + reason + " *****"; +2375 if (cause != null) { +2376 LOG.error(HBaseMarkers.FATAL, msg, cause); +2377 } else { +2378 LOG.error(HBaseMarkers.FATAL, msg); +2379 } +2380 this.abortRequested = true; +2381 // HBASE-4014: show list of coprocessors that were loaded to help debug +2382 // regionserver crashes.Note that we're implicitly using +2383 // java.util.HashSet's toString() method to print the coprocessor names. +2384 LOG.error(HBaseMarkers.FATAL, "RegionServer abort: loaded coprocessors are: " + +2385 CoprocessorHost.getLoadedCoprocessors()); +2386 // Try and dump metrics if abort -- might give clue as to how fatal came about.... +2387 try { +2388 LOG.info("Dump of metrics as JSON on abort: " + DumpRegionServerMetrics.dumpMetrics()); +2389 } catch (MalformedObjectNameException | IOException e) { +2390 LOG.warn("Failed dumping metrics", e); 2391 } -2392 this.abortRequested = true; -2393 // HBASE-4014: show list of coprocessors that were loaded to help debug -2394 // regionserver crashes.Note that we're implicitly using -2395 // java.util.HashSet's toString() method to print the coprocessor names. -2396 LOG.error(HBaseMarkers.FATAL, "RegionServer abort: loaded coprocessors are: " + -2397 CoprocessorHost.getLoadedCoprocessors()); -2398 // Try and dump metrics if abort -- might give clue as to how fatal came about.... -2399 try { -2400 LOG.info("Dump of metrics as JSON on abort: " + DumpRegionServerMetrics.dumpMetrics()); -2401 } catch (MalformedObjectNameException | IOException e) { -2402 LOG.warn("Failed dumping metrics", e); -2403 } -2404 -2405 // Do our best to report our abort to the master, but this may not work -2406 try { -2407 if (cause != null) { -2408 msg += "\nCause:\n" + StringUtils.stringifyException(cause); -2409 } -2410 // Report to the master but only if we have already registered with the master. -2411 if (rssStub != null && this.serverName != null) { -2412 ReportRSFatalErrorRequest.Builder builder = -2413 ReportRSFatalErrorRequest.newBuilder(); -2414 ServerName sn = -2415 ServerName.parseVersionedServerName(this.serverName.getVersionedBytes()); -2416 builder.setServer(ProtobufUtil.toServerName(sn)); -2417 builder.setErrorMessage(msg); -2418 rssStub.reportRSFatalError(null, builder.build()); -2419 } -2420 } catch (Throwable t) { -2421 LOG.warn("Unable to report fatal error to master", t); -2422 } -2423 // shutdown should be run as the internal user -2424 stop(reason, true, null); +2392 +2393 // Do our best to report our abort to the master, but this may not work +2394 try { +2395 if (cause != null) { +2396 msg += "\nCause:\n" + StringUtils.stringifyException(cause); +2397 } +2398 // Report to the master but only if we have already registered with the master. +2399 if (rssStub != null && this.serverName != null) { +2400 ReportRSFatalErrorRequest.Builder builder = +2401 ReportRSFatalErrorRequest.newBuilder(); +2402 ServerName sn = +2403 ServerName.parseVersionedServerName(this.serverName.getVersionedBytes()); +2404 builder.setServer(ProtobufUtil.toServerName(sn)); +2405 builder.setErrorMessage(msg); +2406 rssStub.reportRSFatalError(null, builder.build()); +2407 } +2408 } catch (Throwable t) { +2409 LOG.warn("Unable to report fatal error to master", t); +2410 } +2411 // shutdown should be run as the internal user +2412 stop(reason, true, null); +2413 } +2414 +2415 /** +2416 * @see HRegionServer#abort(String, Throwable) +2417 */ +2418 public void abort(String reason) { +2419 abort(reason, null); +2420 } +2421 +2422 @Override +2423 public boolean isAborted() { +2424 return this.abortRequested; 2425 } 2426 -2427 /** -2428 * @see HRegionServer#abort(String, Throwable) -2429 */ -2430 public void abort(String reason) { -2431 abort(reason, null); -2432 } -2433 -2434 @Override -2435 public boolean isAborted() { -2436 return this.abortRequested; -2437 } -2438 -2439 /* -2440 * Simulate a kill -9 of this server. Exits w/o closing regions or cleaninup -2441 * logs but it does close socket in case want to bring up server on old -2442 * hostname+port immediately. -2443 */ -2444 @VisibleForTesting -2445 protected void kill() { -2446 this.killed = true; -2447 abort("Simulated kill"); -2448 } -2449 -2450 /** -2451 * Called on stop/abort before closing the cluster connection and meta locator. -2452 */ -2453 protected void sendShutdownInterrupt() { -2454 } -2455 -2456 /** -2457 * Wait on all threads to finish. Presumption is that all closes and stops -2458 * have already been called. -2459 */ -2460 protected void stopServiceThreads() { -2461 // clean up the scheduled chores -2462 if (this.choreService != null) choreService.shutdown(); -2463 if (this.nonceManagerChore != null) nonceManagerChore.cancel(true); -2464 if (this.compactionChecker != null) compactionChecker.cancel(true); -2465 if (this.periodicFlusher != null) periodicFlusher.cancel(true); -2466 if (this.healthCheckChore != null) healthCheckChore.cancel(true); -2467 if (this.storefileRefresher != null) storefileRefresher.cancel(true); -2468 if (this.movedRegionsCleaner != null) movedRegionsCleaner.cancel(true); -2469 if (this.fsUtilizationChore != null) fsUtilizationChore.cancel(true); -2470 -2471 if (this.cacheFlusher != null) { -2472 this.cacheFlusher.join(); -2473 } -2474 -2475 if (this.spanReceiverHost != null) { -2476 this.spanReceiverHost.closeReceivers(); -2477 } -2478 if (this.walRoller != null) { -2479 this.walRoller.close(); -2480 } -2481 if (this.compactSplitThread != null) { -2482 this.compactSplitThread.join(); +2427 /* +2428 * Simulate a kill -9 of this server. Exits w/o closing regions or cleaninup +2429 * logs but it does close socket in case want to bring up server on old +2430 * hostname+port immediately. +2431 */ +2432 @VisibleForTesting +2433 protected void kill() { +2434 this.killed = true; +2435 abort("Simulated kill"); +2436 } +2437 +2438 /** +2439 * Called on stop/abort before closing the cluster connection and meta locator. +2440 */ +2441 protected void sendShutdownInterrupt() { +2442 } +2443 +2444 /** +2445 * Wait on all threads to finish. Presumption is that all closes and stops +2446 * have already been called. +2447 */ +2448 protected void stopServiceThreads() { +2449 // clean up the scheduled chores +2450 if (this.choreService != null) choreService.shutdown(); +2451 if (this.nonceManagerChore != null) nonceManagerChore.cancel(true); +2452 if (this.compactionChecker != null) compactionChecker.cancel(true); +2453 if (this.periodicFlusher != null) periodicFlusher.cancel(true); +2454 if (this.healthCheckChore != null) healthCheckChore.cancel(true); +2455 if (this.storefileRefresher != null) storefileRefresher.cancel(true); +2456 if (this.movedRegionsCleaner != null) movedRegionsCleaner.cancel(true); +2457 if (this.fsUtilizationChore != null) fsUtilizationChore.cancel(true); +2458 +2459 if (this.cacheFlusher != null) { +2460 this.cacheFlusher.join(); +2461 } +2462 +2463 if (this.spanReceiverHost != null) { +2464 this.spanReceiverHost.closeReceivers(); +2465 } +2466 if (this.walRoller != null) { +2467 this.walRoller.close(); +2468 } +2469 if (this.compactSplitThread != null) { +2470 this.compactSplitThread.join(); +2471 } +2472 if (this.executorService != null) this.executorService.shutdown(); +2473 if (this.replicationSourceHandler != null && +2474 this.replicationSourceHandler == this.replicationSinkHandler) { +2475 this.replicationSourceHandler.stopReplicationService(); +2476 } else { +2477 if (this.replicationSourceHandler != null) { +2478 this.replicationSourceHandler.stopReplicationService(); +2479 } +2480 if (this.replicationSinkHandler != null) { +2481 this.replicationSinkHandler.stopReplicationService(); +2482 } 2483 } -2484 if (this.executorService != null) this.executorService.shutdown(); -2485 if (this.replicationSourceHandler != null && -2486 this.replicationSourceHandler == this.replicationSinkHandler) { -2487 this.replicationSourceHandler.stopReplicationService(); -2488 } else { -2489 if (this.replicationSourceHandler != null) { -2490 this.replicationSourceHandler.stopReplicationService(); -2491 } -2492 if (this.replicationSinkHandler != null) { -2493 this.replicationSinkHandler.stopReplicationService(); -2494 } -2495 } -2496 } -2497 -2498 /** -2499 * @return Return the object that implements the replication -2500 * source executorService. -2501 */ -2502 @VisibleForTesting -2503 public ReplicationSourceService getReplicationSourceService() { -2504 return replicationSourceHandler; -2505 } -2506 -2507 /** -2508 * @return Return the object that implements the replication -2509 * sink executorService. +2484 } +2485 +2486 /** +2487 * @return Return the object that implements the replication +2488 * source executorService. +2489 */ +2490 @VisibleForTesting +2491 public ReplicationSourceService getReplicationSourceService() { +2492 return replicationSourceHandler; +2493 } +2494 +2495 /** +2496 * @return Return the object that implements the replication +2497 * sink executorService. +2498 */ +2499 ReplicationSinkService getReplicationSinkService() { +2500 return replicationSinkHandler; +2501 } +2502 +2503 /** +2504 * Get the current master from ZooKeeper and open the RPC connection to it. +2505 * To get a fresh connection, the current rssStub must be null. +2506 * Method will block until a master is available. You can break from this +2507 * block by requesting the server stop. +2508 * +2509 * @return master + port, or null if server has been stopped 2510 */ -2511 ReplicationSinkService getReplicationSinkService() { -2512 return replicationSinkHandler; -2513 } -2514 -2515 /** -2516 * Get the current master from ZooKeeper and open the RPC connection to it. -2517 * To get a fresh connection, the current rssStub must be null. -2518 * Method will block until a master is available. You can break from this -2519 * block by requesting the server stop. -2520 * -2521 * @return master + port, or null if server has been stopped -2522 */ -2523 @VisibleForTesting -2524 protected synchronized ServerName createRegionServerStatusStub() { -2525 // Create RS stub without refreshing the master node from ZK, use cached data -2526 return createRegionServerStatusStub(false); -2527 } -2528 -2529 /** -2530 * Get the current master from ZooKeeper and open the RPC connection to it. To get a fresh -2531 * connection, the current rssStub must be null. Method will block until a master is available. -2532 * You can break from this block by requesting the server stop. -2533 * @param refresh If true then master address will be read from ZK, otherwise use cached data -2534 * @return master + port, or null if server has been stopped -2535 */ -2536 @VisibleForTesting -2537 protected synchronized ServerName createRegionServerStatusStub(boolean refresh) { -2538 if (rssStub != null) { -2539 return masterAddressTracker.getMasterAddress(); -2540 } -2541 ServerName sn = null; -2542 long previousLogTime = 0; -2543 RegionServerStatusService.BlockingInterface intRssStub = null; -2544 LockService.BlockingInterface intLockStub = null; -2545 boolean interrupted = false; -2546 try { -2547 while (keepLooping()) { -2548 sn = this.masterAddressTracker.getMasterAddress(refresh); -2549 if (sn == null) { -2550 if (!keepLooping()) { -2551 // give up with no connection. -2552 LOG.debug("No master found and cluster is stopped; bailing out"); -2553 return null; -2554 } -2555 if (System.currentTimeMillis() > (previousLogTime + 1000)) { -2556 LOG.debug("No master found; retry"); -2557 previousLogTime = System.currentTimeMillis(); -2558 } -2559 refresh = true; // let's try pull it from ZK directly -2560 if (sleep(200)) { -2561 interrupted = true; -2562 } -2563 continue; -2564 } -2565 -2566 // If we are on the active master, use the shortcut -2567 if (this instanceof HMaster && sn.equals(getServerName())) { -2568 intRssStub = ((HMaster)this).getMasterRpcServices(); -2569 intLockStub = ((HMaster)this).getMasterRpcServices(); -2570 break; -2571 } -2572 try { -2573 BlockingRpcChannel channel = -2574 this.rpcClient.createBlockingRpcChannel(sn, userProvider.getCurrent(), -2575 shortOperationTimeout); -2576 intRssStub = RegionServerStatusService.newBlockingStub(channel); -2577 intLockStub = LockService.newBlockingStub(channel); -2578 break; -2579 } catch (IOException e) { -2580 if (System.currentTimeMillis() > (previousLogTime + 1000)) { -2581 e = e instanceof RemoteException ? -2582 ((RemoteException)e).unwrapRemoteException() : e; -2583 if (e instanceof ServerNotRunningYetException) { -2584 LOG.info("Master isn't available yet, retrying"); -2585 } else { -2586 LOG.warn("Unable to connect to master. Retrying. Error was:", e); -2587 } -2588 previousLogTime = System.currentTimeMillis(); -2589 } -2590 if (sleep(200)) { -2591 interrupted = true; -2592 } -2593 } -2594 } -2595 } finally { -2596 if (interrupted) { -2597 Thread.currentThread().interrupt(); -2598 } -2599 } -2600 this.rssStub = intRssStub; -2601 this.lockStub = intLockStub; -2602 return sn; -2603 } -2604 -2605 /** -2606 * @return True if we should break loop because cluster is going down or -2607 * this server has been stopped or hdfs has gone bad. -2608 */ -2609 private boolean keepLooping() { -2610 return !this.stopped && isClusterUp(); -2611 } -2612 -2613 /* -2614 * Let the master know we're here Run initialization using parameters passed -2615 * us by the master. -2616 * @return A Map of key/value configurations we got from the Master else -2617 * null if we failed to register. -2618 * @throws IOException -2619 */ -2620 private RegionServerStartupResponse reportForDuty() throws IOException { -2621 if (this.masterless) return RegionServerStartupResponse.getDefaultInstance(); -2622 ServerName masterServerName = createRegionServerStatusStub(true); -2623 if (masterServerName == null) return null; -2624 RegionServerStartupResponse result = null; -2625 try { -2626 rpcServices.requestCount.reset(); -2627 rpcServices.rpcGetRequestCount.reset(); -2628 rpcServices.rpcScanRequestCount.reset(); -2629 rpcServices.rpcMultiRequestCount.reset(); -2630 rpcServices.rpcMutateRequestCount.reset(); -