From dev-return-60346-archive-asf-public=cust-asf.ponee.io@storm.apache.org Thu Aug 1 10:06:22 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [207.244.88.153]) by mx-eu-01.ponee.io (Postfix) with SMTP id C3AC8180644 for ; Thu, 1 Aug 2019 12:06:21 +0200 (CEST) Received: (qmail 87538 invoked by uid 500); 1 Aug 2019 10:06:20 -0000 Mailing-List: contact dev-help@storm.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@storm.apache.org Delivered-To: mailing list dev@storm.apache.org Received: (qmail 87366 invoked by uid 99); 1 Aug 2019 10:06:20 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 01 Aug 2019 10:06:20 +0000 From: GitBox To: dev@storm.apache.org Subject: [GitHub] [storm] srdo commented on a change in pull request #3092: STORM-3474 Large fragmented cluster scheduling time test Message-ID: <156465397493.2264.18303816924505257154.gitbox@gitbox.apache.org> Date: Thu, 01 Aug 2019 10:06:14 -0000 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit srdo commented on a change in pull request #3092: STORM-3474 Large fragmented cluster scheduling time test URL: https://github.com/apache/storm/pull/3092#discussion_r309609442 ########## File path: storm-server/src/test/java/org/apache/storm/scheduler/resource/TestResourceAwareScheduler.java ########## @@ -1067,6 +1068,208 @@ public void minCpuWorkerSplitFails() { assertFalse("Topo-1 unscheduled?", cluster.getAssignmentById(topo1.getId()) != null); } + protected static class TimeBlockResult { + long firstBlockTime; + long lastBlockTime; + } + + private long getMedianBlockTime(TimeBlockResult[] runResults, boolean firstBlock) { + final int numRuns = runResults.length; + assert(numRuns % 2 == 1); // number of runs must be odd to compute median as below + long[] times = new long[numRuns]; + for (int i = 0; i < numRuns; ++i) { + times[i] = firstBlock ? runResults[i].firstBlockTime : runResults[i].lastBlockTime; + } + Arrays.sort(times); + + final int medianIndex = (int) Math.floor(numRuns / 2); + return times[medianIndex]; + } + + /** + * Check time to schedule a fragmented cluster using different strategies + * + * Simulate scheduling on a large production cluster. Find the ratio of time to schedule a set of topologies when + * the cluster is empty and when the cluster is nearly full. While the cluster has sufficient resources to schedule + * all topologies, when nearly full the cluster becomes fragmented and some topologies fail to schedule. + */ + @Test + public void TestLargeFragmentedClusterScheduling() { + /* + Without fragmentation, the cluster would be able to schedule both topologies on each node. Let's call each node + with both topologies scheduled as 100% scheduled. + + We schedule the cluster in 3 blocks of topologies, measuring the time to schedule the blocks. The first, middle + and last blocks attempt to schedule the following 0-10%, 10%-90%, 90%-100%. The last block has a number of + scheduling failures due to cluster fragmentation and its time is dominated by attempting to evict topologies. + + Timing results for scheduling are noisy. As a result, we do multiple runs and use median values for FirstBlock + and LastBlock times. (somewhere a statistician is crying). The ratio of LastBlock / FirstBlock remains fairly constant. + + + TestLargeFragmentedClusterScheduling took 91118 ms + DefaultResourceAwareStrategy, FirstBlock 249.0, LastBlock 1734.0 ratio 6.963855421686747 + GenericResourceAwareStrategy, FirstBlock 215.0, LastBlock 1673.0 ratio 7.78139534883721 + ConstraintSolverStrategy, FirstBlock 279.0, LastBlock 2200.0 ratio 7.885304659498208 + + TestLargeFragmentedClusterScheduling took 98455 ms + DefaultResourceAwareStrategy, FirstBlock 266.0, LastBlock 1812.0 ratio 6.81203007518797 + GenericResourceAwareStrategy, FirstBlock 235.0, LastBlock 1802.0 ratio 7.6680851063829785 + ConstraintSolverStrategy, FirstBlock 304.0, LastBlock 2320.0 ratio 7.631578947368421 + + TestLargeFragmentedClusterScheduling took 97268 ms + DefaultResourceAwareStrategy, FirstBlock 251.0, LastBlock 1826.0 ratio 7.274900398406374 + GenericResourceAwareStrategy, FirstBlock 220.0, LastBlock 1719.0 ratio 7.8136363636363635 + ConstraintSolverStrategy, FirstBlock 296.0, LastBlock 2469.0 ratio 8.341216216216216 + + TestLargeFragmentedClusterScheduling took 97963 ms + DefaultResourceAwareStrategy, FirstBlock 249.0, LastBlock 1788.0 ratio 7.180722891566265 + GenericResourceAwareStrategy, FirstBlock 240.0, LastBlock 1796.0 ratio 7.483333333333333 + ConstraintSolverStrategy, FirstBlock 328.0, LastBlock 2544.0 ratio 7.7560975609756095 + + TestLargeFragmentedClusterScheduling took 93106 ms + DefaultResourceAwareStrategy, FirstBlock 258.0, LastBlock 1714.0 ratio 6.6434108527131785 + GenericResourceAwareStrategy, FirstBlock 215.0, LastBlock 1692.0 ratio 7.869767441860465 + ConstraintSolverStrategy, FirstBlock 309.0, LastBlock 2342.0 ratio 7.5792880258899675 + + Choose the median value of the values above + DefaultResourceAwareStrategy 6.96 + GenericResourceAwareStrategy 7.78 + ConstraintSolverStrategy 7.75 + */ + + final int numNodes = 500; + final String[] strategies = new String[]{ + DefaultResourceAwareStrategy.class.getName(), + GenericResourceAwareStrategy.class.getName(), + ConstraintSolverStrategy.class.getName() + }; + + final int numStrategies = strategies.length; + final int numRuns = 5; + TimeBlockResult testResults[][] = new TimeBlockResult[numStrategies][numRuns]; + + // Get first and last block times for multiple runs and strategies + long startTime = Time.currentTimeMillis(); + for (int strategyIdx = 0; strategyIdx < numStrategies; ++strategyIdx) { + String strategy = strategies[strategyIdx]; + + for (int run = 0; run < numRuns; ++run) { + testResults[strategyIdx][run] = testLargeClusterSchedulingTiming(numNodes, strategy); + } + } + + // Log median ratios for different strategies + LOG.info("TestLargeFragmentedClusterScheduling took {} ms", Time.currentTimeMillis() - startTime); + for (int strategyIdx = 0; strategyIdx < numStrategies; ++strategyIdx) { + double medianFirstBlockTime = getMedianBlockTime(testResults[strategyIdx], true); + double medianLastBlockTime = getMedianBlockTime(testResults[strategyIdx], false); + double ratio = medianLastBlockTime / medianFirstBlockTime; + LOG.info("{}, FirstBlock {}, LastBlock {} ratio {}", strategies[strategyIdx], medianFirstBlockTime, medianLastBlockTime, ratio); + } + + // Check last block scheduling time does not get significantly slower + final double[] acceptedStrategyTimeRatios = {6.96, 7.78, 7.75}; Review comment: How did you find these values? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services