From commits-return-23004-archive-asf-public=cust-asf.ponee.io@accumulo.apache.org Thu Jun 13 20:59:56 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id BD08D18064E for ; Thu, 13 Jun 2019 22:59:55 +0200 (CEST) Received: (qmail 72670 invoked by uid 500); 13 Jun 2019 20:59:55 -0000 Mailing-List: contact commits-help@accumulo.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@accumulo.apache.org Delivered-To: mailing list commits@accumulo.apache.org Received: (qmail 72661 invoked by uid 99); 13 Jun 2019 20:59:55 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 13 Jun 2019 20:59:55 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id 1387E87AD7; Thu, 13 Jun 2019 20:59:50 +0000 (UTC) Date: Thu, 13 Jun 2019 20:59:50 +0000 To: "commits@accumulo.apache.org" Subject: [accumulo] branch 2.0 updated: Allow master start to block waiting for a number of tservers (#1204) MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <156045958996.7296.6506575764167942618@gitbox.apache.org> From: edcoleman@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: accumulo X-Git-Refname: refs/heads/2.0 X-Git-Reftype: branch X-Git-Oldrev: 27753af5e2546fc16b454f479fe9b435e6c49a83 X-Git-Newrev: de9ba68633a0d7e0ed55ee98533c82ab0b4bb12b X-Git-Rev: de9ba68633a0d7e0ed55ee98533c82ab0b4bb12b X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. 
edcoleman pushed a commit to branch 2.0 in repository https://gitbox.apache.org/repos/asf/accumulo.git The following commit(s) were added to refs/heads/2.0 by this push: new de9ba68 Allow master start to block waiting for a number of tservers (#1204) de9ba68 is described below commit de9ba68633a0d7e0ed55ee98533c82ab0b4bb12b Author: EdColeman AuthorDate: Thu Jun 13 16:59:45 2019 -0400 Allow master start to block waiting for a number of tservers (#1204) Adds two parameters: - MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT - sets desired number of tservers - MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT - sets maximum time to wait. These are the same changes submitted as pull request #1158 for 1.9.x, merged to 2.0. Request #1158 will be available as a patch, but is not expected to be merged because of semver requirements. --- .../org/apache/accumulo/core/conf/Property.java | 13 ++- .../java/org/apache/accumulo/master/Master.java | 92 ++++++++++++++++++++++ 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java index 4c88785..69f9f63 100644 --- a/core/src/main/java/org/apache/accumulo/core/conf/Property.java +++ b/core/src/main/java/org/apache/accumulo/core/conf/Property.java @@ -285,7 +285,18 @@ public enum Property { MASTER_METADATA_SUSPENDABLE("master.metadata.suspendable", "false", PropertyType.BOOLEAN, "Allow tablets for the " + MetadataTable.NAME + " table to be suspended via table.suspend.duration."), - + MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT("master.startup.tserver.avail.min.count", "0", + PropertyType.COUNT, + "Minimum number of tservers that need to be registered before master will " + + "start tablet assignment - checked at master initialization, when master gets lock. " + + " When set to 0 or less, no blocking occurs. Default is 0 (disabled) to keep original " + + " behaviour. 
Added with version 2.0"), + MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT("master.startup.tserver.avail.max.wait", "0", + PropertyType.TIMEDURATION, + "Maximum time master will wait for tserver available threshold " + + "to be reached before continuing. When set to 0 or less, will block " + + "indefinitely. Default is 0 to block indefinitely. Only valid when tserver available " + + "threshold is set greater than 0. Added with version 2.0"), // properties that are specific to tablet server behavior TSERV_PREFIX("tserver.", null, PropertyType.PREFIX, "Properties in this category affect the behavior of the tablet servers"), diff --git a/server/master/src/main/java/org/apache/accumulo/master/Master.java b/server/master/src/main/java/org/apache/accumulo/master/Master.java index 0efd08d..b5921d6 100644 --- a/server/master/src/main/java/org/apache/accumulo/master/Master.java +++ b/server/master/src/main/java/org/apache/accumulo/master/Master.java @@ -77,6 +77,7 @@ import org.apache.accumulo.core.trace.TraceUtil; import org.apache.accumulo.core.util.Daemon; import org.apache.accumulo.fate.AgeOffStore; import org.apache.accumulo.fate.Fate; +import org.apache.accumulo.fate.util.Retry; import org.apache.accumulo.fate.zookeeper.ZooLock; import org.apache.accumulo.fate.zookeeper.ZooLock.LockLossReason; import org.apache.accumulo.fate.zookeeper.ZooReaderWriter; @@ -1021,6 +1022,12 @@ public class Master extends AbstractServer tserverSet.startListeningForTabletServerChanges(); + try { + blockForTservers(); + } catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + } + ZooReaderWriter zReaderWriter = context.getZooReaderWriter(); try { @@ -1207,6 +1214,91 @@ public class Master extends AbstractServer log.info("exiting"); } + /** + * Allows property configuration to block master start-up waiting for a minimum number of tservers + * to register in zookeeper. 
It also accepts a maximum time to wait - if the time expires, the + * start-up will continue with any tservers available. This check is only performed at master + * initialization, when the master acquires the lock. The following properties are used to control + * the behaviour: + * <ul> + * <li>MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT - when set to 0 or less, no blocking occurs (default + * behaviour) otherwise will block until the number of tservers are available.</li> + * <li>MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT - time to wait in milliseconds. When set to 0 or + * less, will block indefinitely.</li> + * </ul>
+ * + * @throws InterruptedException + * if interrupted while blocking, propagated for caller to handle. + */ + private void blockForTservers() throws InterruptedException { + + long waitStart = System.currentTimeMillis(); + + long minTserverCount = + getConfiguration().getCount(Property.MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT); + + if (minTserverCount <= 0) { + log.info( + "tserver availability check disabled, continuing with {} servers. " + "To enable, set {}", + tserverSet.size(), Property.MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT.getKey()); + return; + } + + long maxWait = + getConfiguration().getTimeInMillis(Property.MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT); + + if (maxWait <= 0) { + log.info("tserver availability check set to block indefinitely, To change, set {} > 0.", + Property.MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT.getKey()); + maxWait = Long.MAX_VALUE; + } + + // honor Retry condition that initial wait < max wait, otherwise use small value to allow thread + // yield to happen + long initialWait = Math.min(50, maxWait / 2); + + Retry tserverRetry = + Retry.builder().infiniteRetries().retryAfter(initialWait, TimeUnit.MILLISECONDS) + .incrementBy(15_000, TimeUnit.MILLISECONDS).maxWait(maxWait, TimeUnit.MILLISECONDS) + .backOffFactor(1).logInterval(30_000, TimeUnit.MILLISECONDS).createRetry(); + + log.info("Checking for tserver availability - need to reach {} servers. Have {}", + minTserverCount, tserverSet.size()); + + boolean needTservers = tserverSet.size() < minTserverCount; + // also bound total elapsed time by maxWait - the Retry is built with infiniteRetries() + while (needTservers && tserverRetry.canRetry() + && (System.currentTimeMillis() - waitStart) < maxWait) { + + tserverRetry.waitForNextAttempt(); + + needTservers = tserverSet.size() < minTserverCount; + + // suppress last message once threshold reached. + if (needTservers) { + log.info( + "Blocking for tserver availability - need to reach {} servers. 
Have {}" + + " Time spent blocking {} sec.", + minTserverCount, tserverSet.size(), + TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart)); + } + } + + if (tserverSet.size() < minTserverCount) { + log.warn( + "tserver availability check time expired - continuing. Requested {}, have {} tservers on line. " + + " Time waiting {} sec", + minTserverCount, tserverSet.size(), + TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart)); + + } else { + log.info( + "tserver availability check completed. Requested {}, have {} tservers on line. " + + " Time waiting {} sec", + minTserverCount, tserverSet.size(), + TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart)); + } + } + private TServer setupReplication() throws UnknownHostException, KeeperException, InterruptedException { ServerContext context = getContext();