hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wan...@apache.org
Subject [2/6] hadoop git commit: YARN-6608. Backport all SLS improvements from trunk to branch-2. (Carlo Curino via wangda)
Date Wed, 18 Oct 2017 02:45:24 GMT
http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java
index e152696..154bcc9 100644
--- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java
+++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java
@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.yarn.sls.web;
 
-import java.io.File;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.text.MessageFormat;
@@ -26,11 +25,12 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
+import javax.servlet.ServletException;
 import javax.servlet.http.HttpServlet;
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 
-import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceStability.Unstable;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
@@ -38,12 +38,12 @@ import org.apache.hadoop.yarn.sls.SLSRunner;
 import org.apache.hadoop.yarn.sls.scheduler.FairSchedulerMetrics;
 import org.apache.hadoop.yarn.sls.scheduler.SchedulerMetrics;
 import org.apache.hadoop.yarn.sls.scheduler.SchedulerWrapper;
+
 import org.mortbay.jetty.Handler;
 import org.mortbay.jetty.Request;
 import org.mortbay.jetty.Server;
 import org.mortbay.jetty.handler.AbstractHandler;
 import org.mortbay.jetty.handler.ResourceHandler;
-
 import com.codahale.metrics.Counter;
 import com.codahale.metrics.Gauge;
 import com.codahale.metrics.Histogram;
@@ -84,12 +84,12 @@ public class SLSWebApp extends HttpServlet {
     // load templates
     ClassLoader cl = Thread.currentThread().getContextClassLoader();
     try {
-      simulateInfoTemplate = FileUtils.readFileToString(new File(
-              cl.getResource("simulate.info.html.template").getFile()));
-      simulateTemplate = FileUtils.readFileToString(new File(
-              cl.getResource("simulate.html.template").getFile()));
-      trackTemplate = FileUtils.readFileToString(new File(
-              cl.getResource("track.html.template").getFile()));
+      simulateInfoTemplate = IOUtils.toString(
+          cl.getResourceAsStream("html/simulate.info.html.template"));
+      simulateTemplate = IOUtils.toString(
+          cl.getResourceAsStream("html/simulate.html.template"));
+      trackTemplate = IOUtils.toString(
+          cl.getResourceAsStream("html/track.html.template"));
     } catch (IOException e) {
       e.printStackTrace();
     }
@@ -105,24 +105,23 @@ public class SLSWebApp extends HttpServlet {
 
   public SLSWebApp(SchedulerWrapper wrapper, int metricsAddressPort) {
     this.wrapper = wrapper;
-    metrics = wrapper.getMetrics();
-    handleOperTimecostHistogramMap =
-            new HashMap<SchedulerEventType, Histogram>();
-    queueAllocatedMemoryCounterMap = new HashMap<String, Counter>();
-    queueAllocatedVCoresCounterMap = new HashMap<String, Counter>();
+    handleOperTimecostHistogramMap = new HashMap<>();
+    queueAllocatedMemoryCounterMap = new HashMap<>();
+    queueAllocatedVCoresCounterMap = new HashMap<>();
     schedulerMetrics = wrapper.getSchedulerMetrics();
+    metrics = schedulerMetrics.getMetrics();
     port = metricsAddressPort;
   }
 
   public void start() throws Exception {
-    // static files
     final ResourceHandler staticHandler = new ResourceHandler();
     staticHandler.setResourceBase("html");
 
     Handler handler = new AbstractHandler() {
       @Override
       public void handle(String target, HttpServletRequest request,
-                         HttpServletResponse response, int dispatch) {
+          HttpServletResponse response, int dispatch)
+          throws IOException, ServletException {
         try{
           // timeunit
           int timeunit = 1000;   // second, divide millionsecond / 1000
@@ -183,14 +182,14 @@ public class SLSWebApp extends HttpServlet {
     response.setStatus(HttpServletResponse.SC_OK);
 
     String simulateInfo;
-    if (SLSRunner.simulateInfoMap.isEmpty()) {
+    if (SLSRunner.getSimulateInfoMap().isEmpty()) {
       String empty = "<tr><td colspan='2' align='center'>" +
               "No information available</td></tr>";
       simulateInfo = MessageFormat.format(simulateInfoTemplate, empty);
     } else {
       StringBuilder info = new StringBuilder();
       for (Map.Entry<String, Object> entry :
-              SLSRunner.simulateInfoMap.entrySet()) {
+              SLSRunner.getSimulateInfoMap().entrySet()) {
         info.append("<tr>");
         info.append("<td class='td1'>").append(entry.getKey()).append("</td>");
         info.append("<td class='td2'>").append(entry.getValue())
@@ -221,7 +220,7 @@ public class SLSWebApp extends HttpServlet {
     response.setStatus(HttpServletResponse.SC_OK);
 
     // queues {0}
-    Set<String> queues = wrapper.getQueueSet();
+    Set<String> queues = wrapper.getTracker().getQueueSet();
     StringBuilder queueInfo = new StringBuilder();
 
     int i = 0;
@@ -260,7 +259,7 @@ public class SLSWebApp extends HttpServlet {
 
     // tracked queues {0}
     StringBuilder trackedQueueInfo = new StringBuilder();
-    Set<String> trackedQueues = wrapper.getQueueSet();
+    Set<String> trackedQueues = wrapper.getTracker().getQueueSet();
     for(String queue : trackedQueues) {
       trackedQueueInfo.append("<option value='Queue ").append(queue)
               .append("'>").append(queue).append("</option>");
@@ -268,7 +267,7 @@ public class SLSWebApp extends HttpServlet {
 
     // tracked apps {1}
     StringBuilder trackedAppInfo = new StringBuilder();
-    Set<String> trackedApps = wrapper.getTrackedAppSet();
+    Set<String> trackedApps = wrapper.getTracker().getTrackedAppSet();
     for(String job : trackedApps) {
       trackedAppInfo.append("<option value='Job ").append(job)
               .append("'>").append(job).append("</option>");
@@ -417,7 +416,7 @@ public class SLSWebApp extends HttpServlet {
     // allocated resource for each queue
     Map<String, Double> queueAllocatedMemoryMap = new HashMap<String, Double>();
     Map<String, Long> queueAllocatedVCoresMap = new HashMap<String, Long>();
-    for (String queue : wrapper.getQueueSet()) {
+    for (String queue : wrapper.getTracker().getQueueSet()) {
       // memory
       String key = "counter.queue." + queue + ".allocated.memory";
       if (! queueAllocatedMemoryCounterMap.containsKey(queue) &&
@@ -457,7 +456,7 @@ public class SLSWebApp extends HttpServlet {
             .append(",\"cluster.available.memory\":").append(availableMemoryGB)
             .append(",\"cluster.available.vcores\":").append(availableVCoresGB);
 
-    for (String queue : wrapper.getQueueSet()) {
+    for (String queue : wrapper.getTracker().getQueueSet()) {
       sb.append(",\"queue.").append(queue).append(".allocated.memory\":")
               .append(queueAllocatedMemoryMap.get(queue));
       sb.append(",\"queue.").append(queue).append(".allocated.vcores\":")

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md b/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md
index 2cffc86..d1848e8 100644
--- a/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md
+++ b/hadoop-tools/hadoop-sls/src/site/markdown/SchedulerLoadSimulator.md
@@ -27,9 +27,11 @@ Yarn Scheduler Load Simulator (SLS)
     * [Metrics](#Metrics)
         * [Real-time Tracking](#Real-time_Tracking)
         * [Offline Analysis](#Offline_Analysis)
+    * [Synthetic Load Generator](#SynthGen)
     * [Appendix](#Appendix)
         * [Resources](#Resources)
         * [SLS JSON input file format](#SLS_JSON_input_file_format)
+        * [SYNTH JSON input file format](#SYNTH_JSON_input_file_format)
         * [Simulator input topology file format](#Simulator_input_topology_file_format)
 
 Overview
@@ -72,7 +74,7 @@ The following figure illustrates the implementation architecture of the simulato
 
 ![The architecture of the simulator](images/sls_arch.png)
 
-The simulator takes input of workload traces, and fetches the cluster and applications information. For each NM and AM, the simulator builds a simulator to simulate their running. All NM/AM simulators run in a thread pool. The simulator reuses Yarn Resource Manager, and builds a wrapper out of the scheduler. The Scheduler Wrapper can track the scheduler behaviors and generates several logs, which are the outputs of the simulator and can be further analyzed.
+The simulator takes input of workload traces, or synthetic load distributions and generaters the cluster and applications information. For each NM and AM, the simulator builds a simulator to simulate their running. All NM/AM simulators run in a thread pool. The simulator reuses Yarn Resource Manager, and builds a wrapper out of the scheduler. The Scheduler Wrapper can track the scheduler behaviors and generates several logs, which are the outputs of the simulator and can be further analyzed.
 
 ### Usecases
 
@@ -97,7 +99,7 @@ This section will show how to use the simulator. Here let `$HADOOP_ROOT` represe
 
 *   `bin`: contains running scripts for the simulator.
 
-*   `html`: contains several html/css/js files we needed for real-time tracking.
+*   `html`: Users can also reproduce those real-time tracking charts in offline mode. Just upload the `realtimetrack.json` to `$HADOOP_ROOT/share/hadoop/tools/sls/html/showSimulationTrace.html`. For browser security problem, need to put files `realtimetrack.json` and `showSimulationTrace.html` in the same directory.
 
 *   `sample-conf`: specifies the simulator configurations.
 
@@ -179,17 +181,30 @@ The simulator supports two types of input files: the rumen traces and its own in
 
     $ cd $HADOOP_ROOT/share/hadoop/tools/sls
     $ bin/slsrun.sh
-      --input-rumen |--input-sls=<TRACE_FILE1,TRACE_FILE2,...>
-      --output-dir=<SLS_SIMULATION_OUTPUT_DIRECTORY> [--nodes=<SLS_NODES_FILE>]
-        [--track-jobs=<JOBID1,JOBID2,...>] [--print-simulation]
+      Usage: slsrun.sh <OPTIONS>
+                 --tracetype=<SYNTH | SLS | RUMEN>
+                 --tracelocation=<FILE1,FILE2,...>
+                 (deprecated --input-rumen=<FILE1,FILE2,...>  | --input-sls=<FILE1,FILE2,...>)
+                 --output-dir=<SLS_SIMULATION_OUTPUT_DIRECTORY>
+                 [--nodes=<SLS_NODES_FILE>]
+                 [--track-jobs=<JOBID1,JOBID2,...>]
+                 [--print-simulation]
+
 
 *   `--input-rumen`: The input rumen trace files. Users can input multiple
     files, separated by comma. One example trace is provided in
     `$HADOOP_ROOT/share/hadoop/tools/sls/sample-data/2jobs2min-rumen-jh.json`.
+    This is equivalent to `--tracetype=RUMEN --tracelocation=<path_to_trace>`.
 
 *   `--input-sls`: Simulator its own file format. The simulator also
     provides a tool to convert rumen traces to sls traces (`rumen2sls.sh`).
     Refer to appendix for an example of sls input json file.
+    This is equivalent to `--tracetype=SLS --tracelocation=<path_to_trace>`.
+
+*   `--tracetype`: This is the new way to configure the trace generation and
+    takes values RUMEN, SLS, or SYNTH, to trigger the three type of load generation
+
+*   `--tracelocation`: Path to the input file, matching the tracetype above.
 
 *   `--output-dir`: The output directory for generated running logs and
     metrics.
@@ -281,30 +296,57 @@ After the simulator finishes, all logs are saved in the output directory specifi
 
 Users can also reproduce those real-time tracking charts in offline mode. Just upload the `realtimetrack.json` to `$HADOOP_ROOT/share/hadoop/tools/sls/html/showSimulationTrace.html`. For browser security problem, need to put files `realtimetrack.json` and `showSimulationTrace.html` in the same directory.
 
+
+Synthetic Load Generator
+------------------------
+The Synthetic Load Generator complements the extensive nature of SLS-native and RUMEN traces, by providing a
+distribution-driven generation of load. The load generator is organized as a JobStoryProducer
+(compatible with rumen, and thus gridmix for later integration). We seed the Random number generator so
+that results randomized but deterministic---hence reproducible.
+We organize the jobs being generated around */workloads/job_class* hierarchy, which allow to easily
+group jobs with similar behaviors and categorize them (e.g., jobs with long running containers, or maponly
+computations, etc..). The user can control average and standard deviations for many of the
+important parameters, such as number of mappers/reducers, duration of mapper/reducers, size
+(mem/cpu) of containers, chance of reservation, etc. We use weighted-random sampling (whenever we
+pick among a small number of options) or LogNormal distributions (to avoid negative values) when we
+pick from wide ranges of values---see appendix on LogNormal distributions.
+
+The SYNTH mode of SLS is very convenient to generate very large loads without the need for extensive input
+files. This allows to easily explore wide range of use cases (e.g., imagine simulating 100k jobs, and in different
+runs simply tune the average number of mappers, or average task duration), in an efficient and compact way.
+
 Appendix
 --------
 
 ### Resources
 
 [YARN-1021](https://issues.apache.org/jira/browse/YARN-1021) is the main JIRA that introduces Yarn Scheduler Load Simulator to Hadoop Yarn project.
+[YARN-6363](https://issues.apache.org/jira/browse/YARN-6363) is the main JIRA that introduces the Synthetic Load Generator to SLS.
 
 ### SLS JSON input file format
 
 Here we provide an example format of the sls json file, which contains 2 jobs. The first job has 3 map tasks and the second one has 2 map tasks.
 
     {
-      "am.type" : "mapreduce",
-      "job.start.ms" : 0,
-      "job.end.ms" : 95375,
-      "job.queue.name" : "sls_queue_1",
-      "job.id" : "job_1",
-      "job.user" : "default",
+      "num.nodes": 3,  // total number of nodes in the cluster
+      "num.racks": 1   // total number of racks in the cluster, it divides num.nodes into the racks evenly, optional, the default value is 1
+    }
+    {
+      "am.type" : "mapreduce", // type of AM, optional, the default value is "mapreduce"
+      "job.start.ms" : 0,      // job start time
+      "job.end.ms" : 95375,    // job finish time, optional, the default value is 0
+      "job.queue.name" : "sls_queue_1", // the queue job will be submitted to
+      "job.id" : "job_1",      // the job id used to track the job, optional. The default value, an zero-based integer increasing with number of jobs, is used if this is not specified or job.count > 1
+      "job.user" : "default",  // user, optional, the default value is "default"
+      "job.count" : 1,         // number of jobs, optional, the default value is 1
       "job.tasks" : [ {
-        "container.host" : "/default-rack/node1",
-        "container.start.ms" : 6664,
-        "container.end.ms" : 23707,
-        "container.priority" : 20,
-        "container.type" : "map"
+        "count": 1,    // number of tasks, optional, the default value is 1
+        "container.host" : "/default-rack/node1",  // host the container asks for
+        "container.start.ms" : 6664,  // container start time, optional
+        "container.end.ms" : 23707,   // container finish time, optional
+        "duration.ms":  50000,        // duration of the container, optional if start and end time is specified
+        "container.priority" : 20,    // priority of the container, optional, the default value is 20
+        "container.type" : "map"      // type of the container, could be "map" or "reduce", optional, the default value is "map"
       }, {
         "container.host" : "/default-rack/node3",
         "container.start.ms" : 6665,
@@ -341,6 +383,77 @@ Here we provide an example format of the sls json file, which contains 2 jobs. T
       } ]
     }
 
+
+### SYNTH JSON input file format
+Here we provide an example format of the synthetic generator json file. We use *(json-non-conforming)* inline comments to explain the use of each parameter.
+
+    {
+      "description" : "tiny jobs workload",    //description of the meaning of this collection of workloads
+      "num_nodes" : 10,  //total nodes in the simulated cluster
+      "nodes_per_rack" : 4, //number of nodes in each simulated rack
+      "num_jobs" : 10, // total number of jobs being simulated
+      "rand_seed" : 2, //the random seed used for deterministic randomized runs
+
+      // a list of “workloads”, each of which has job classes, and temporal properties
+      "workloads" : [
+        {
+          "workload_name" : "tiny-test", // name of the workload
+          "workload_weight": 0.5,  // used for weighted random selection of which workload to sample from
+          "queue_name" : "sls_queue_1", //queue the job will be submitted to
+
+        //different classes of jobs for this workload
+           "job_classes" : [
+            {
+              "class_name" : "class_1", //name of the class
+              "class_weight" : 1.0, //used for weighted random selection of class within workload
+
+              //nextr group controls average and standard deviation of a LogNormal distribution that
+              //determines the number of mappers and reducers for thejob.
+              "mtasks_avg" : 5,
+              "mtasks_stddev" : 1,
+              "rtasks_avg" : 5,
+              "rtasks_stddev" : 1,
+
+              //averge and stdev input param of LogNormal distribution controlling job duration
+              "dur_avg" : 60,
+              "dur_stddev" : 5,
+
+              //averge and stdev input param of LogNormal distribution controlling mappers and reducers durations
+              "mtime_avg" : 10,
+              "mtime_stddev" : 2,
+              "rtime_avg" : 20,
+              "rtime_stddev" : 4,
+
+              //averge and stdev input param of LogNormal distribution controlling memory and cores for map and reduce
+              "map_max_memory_avg" : 1024,
+              "map_max_memory_stddev" : 0.001,
+              "reduce_max_memory_avg" : 2048,
+              "reduce_max_memory_stddev" : 0.001,
+              "map_max_vcores_avg" : 1,
+              "map_max_vcores_stddev" : 0.001,
+              "reduce_max_vcores_avg" : 2,
+              "reduce_max_vcores_stddev" : 0.001,
+
+              //probability of running this job with a reservation
+              "chance_of_reservation" : 0.5,
+              //input parameters of LogNormal distribution that determines the deadline slack (as a multiplier of job duration)
+              "deadline_factor_avg" : 10.0,
+              "deadline_factor_stddev" : 0.001,
+            }
+           ],
+        // for each workload determines with what probability each time bucket is picked to choose the job starttime.
+        // In the example below the jobs have twice as much chance to start in the first minute than in the second minute
+        // of simulation, and then zero chance thereafter.
+          "time_distribution" : [
+            { "time" : 1, "weight" : 66 },
+            { "time" : 60, "weight" : 33 },
+            { "time" : 120, "jobs" : 0 }
+         ]
+        }
+     ]
+    }
+
+
 ### Simulator input topology file format
 
 Here is an example input topology file which has 3 nodes organized in 1 rack.
@@ -355,3 +468,9 @@ Here is an example input topology file which has 3 nodes organized in 1 rack.
         "node" : "node3"
       }]
     }
+
+### Notes on LogNormal distribution:
+LogNormal distributions represent well many of the parameters we see in practice (e.g., most jobs have
+a small number of mappers, but few might be very large, and few very small, but greater than zero. It is
+however worth noticing that it might be tricky to use, as the average is typically on the right side of the
+peak (most common value) of the distribution, because the distribution has a one-side tail.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java
new file mode 100644
index 0000000..6b369f2
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/BaseSLSRunnerTest.java
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.sls;
+
+import net.jcip.annotations.NotThreadSafe;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.MetricsInvariantChecker;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.UUID;
+
+/**
+ * This is a base class to ease the implementation of SLS-based tests.
+ */
+@RunWith(value = Parameterized.class)
+@NotThreadSafe
+@SuppressWarnings("VisibilityModifier")
+public abstract class BaseSLSRunnerTest {
+
+  @Parameter(value = 0)
+  public String schedulerType;
+
+  @Parameter(value = 1)
+  public String traceType;
+
+  @Parameter(value = 2)
+  public String traceLocation;
+
+  @Parameter(value = 3)
+  public String nodeFile;
+
+  protected SLSRunner sls;
+  protected String ongoingInvariantFile;
+  protected String exitInvariantFile;
+
+  @Before
+  public abstract void setup();
+
+  @After
+  public void tearDown() throws InterruptedException {
+    sls.stop();
+  }
+
+  public void runSLS(Configuration conf, long timeout) throws Exception {
+    File tempDir = new File("target", UUID.randomUUID().toString());
+    final List<Throwable> exceptionList =
+        Collections.synchronizedList(new ArrayList<Throwable>());
+
+    Thread.setDefaultUncaughtExceptionHandler(
+        new Thread.UncaughtExceptionHandler() {
+          @Override
+          public void uncaughtException(Thread t, Throwable e) {
+            e.printStackTrace();
+            exceptionList.add(e);
+          }
+        });
+
+    // start the simulator
+    File slsOutputDir = new File(tempDir.getAbsolutePath() + "/slsoutput/");
+
+    String[] args;
+
+    switch (traceType) {
+    case "OLD_SLS":
+      args = new String[] {"-inputsls", traceLocation, "-output",
+          slsOutputDir.getAbsolutePath() };
+      break;
+    case "OLD_RUMEN":
+      args = new String[] {"-inputrumen", traceLocation, "-output",
+          slsOutputDir.getAbsolutePath() };
+      break;
+    default:
+      args = new String[] {"-tracetype", traceType, "-tracelocation",
+          traceLocation, "-output", slsOutputDir.getAbsolutePath() };
+    }
+
+    if (nodeFile != null) {
+      args = ArrayUtils.addAll(args, new String[] {"-nodes", nodeFile });
+    }
+
+    // enable continuous invariant checks
+    conf.set(YarnConfiguration.RM_SCHEDULER, schedulerType);
+    if (ongoingInvariantFile != null) {
+      conf.set(YarnConfiguration.RM_SCHEDULER_MONITOR_POLICIES,
+          MetricsInvariantChecker.class.getCanonicalName());
+      conf.set(MetricsInvariantChecker.INVARIANTS_FILE, ongoingInvariantFile);
+      conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, true);
+    }
+
+    sls = new SLSRunner(conf);
+    sls.run(args);
+
+    // wait for timeout seconds before stop, unless there is an uncaught
+    // exception in which
+    // case fail fast.
+    while (timeout >= 0) {
+      Thread.sleep(1000);
+
+      if (!exceptionList.isEmpty()) {
+        sls.stop();
+        Assert.fail("TestSLSRunner catched exception from child thread "
+            + "(TaskRunner.Task): " + exceptionList);
+        break;
+      }
+      timeout--;
+    }
+    shutdownHookInvariantCheck();
+  }
+
+  /**
+   * Checks exit invariants (e.g., number of apps submitted, completed, etc.).
+   */
+  private void shutdownHookInvariantCheck() {
+
+    if(exitInvariantFile!=null) {
+      MetricsInvariantChecker ic = new MetricsInvariantChecker();
+      Configuration conf = new Configuration();
+      conf.set(MetricsInvariantChecker.INVARIANTS_FILE, exitInvariantFile);
+      conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, true);
+      ic.init(conf, null, null);
+      ic.editSchedule();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java
new file mode 100644
index 0000000..22e1e2e
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestReservationSystemInvariants.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.sls;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.InvariantsChecker;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants.ReservationInvariantsChecker;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import net.jcip.annotations.NotThreadSafe;
+
+/**
+ * This test performs an SLS run enabling a
+ * {@code ReservationInvariantsChecker}.
+ */
+@RunWith(value = Parameterized.class)
+@NotThreadSafe
+public class TestReservationSystemInvariants extends BaseSLSRunnerTest {
+
+  @Parameters(name = "Testing with: {1}, {0}, (nodeFile {3})")
+  public static Collection<Object[]> data() {
+    // Test with both schedulers, and all three trace types
+    return Arrays.asList(new Object[][] {
+        {CapacityScheduler.class.getCanonicalName(), "SYNTH",
+            "src/test/resources/syn.json", null},
+        {FairScheduler.class.getCanonicalName(), "SYNTH",
+            "src/test/resources/syn.json", null}
+    });
+  }
+
+  @Test(timeout = 120000)
+  @SuppressWarnings("all")
+  public void testSimulatorRunning() throws Exception {
+
+    Configuration conf = new Configuration(false);
+    conf.set(YarnConfiguration.RM_SCHEDULER, schedulerType);
+    conf.setBoolean(YarnConfiguration.RM_SCHEDULER_ENABLE_MONITORS, true);
+    conf.set(YarnConfiguration.RM_SCHEDULER_MONITOR_POLICIES,
+        ReservationInvariantsChecker.class.getCanonicalName());
+    conf.setBoolean(InvariantsChecker.THROW_ON_VIOLATION, true);
+
+
+    long timeTillShutDownInSec = 90;
+    runSLS(conf, timeTillShutDownInSec);
+
+  }
+
+  @Override
+  public void setup() {
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java
index 9da8ef3..5ab893d 100644
--- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSLSRunner.java
@@ -18,53 +18,72 @@
 
 package org.apache.hadoop.yarn.sls;
 
-import org.junit.Assert;
+import net.jcip.annotations.NotThreadSafe;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
+import org.junit.Before;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.*;
 
-import java.io.File;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.UUID;
+import java.util.*;
 
-public class TestSLSRunner {
+/**
+ * This test performs simple runs of the SLS with different trace types and
+ * schedulers.
+ */
+@RunWith(value = Parameterized.class)
+@NotThreadSafe
+public class TestSLSRunner extends BaseSLSRunnerTest {
 
-  @Test
-  @SuppressWarnings("all")
-  public void testSimulatorRunning() throws Exception {
-    File tempDir = new File("target", UUID.randomUUID().toString());
-    final List<Throwable> exceptionList =
-        Collections.synchronizedList(new ArrayList<Throwable>());
+  @Parameters(name = "Testing with: {1}, {0}, (nodeFile {3})")
+  public static Collection<Object[]> data() {
 
-    Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
-      @Override
-      public void uncaughtException(Thread t, Throwable e) {
-        exceptionList.add(e);
-      }
-    });
+    String capScheduler = CapacityScheduler.class.getCanonicalName();
+    String fairScheduler = FairScheduler.class.getCanonicalName();
+    String slsTraceFile = "src/test/resources/inputsls.json";
+    String rumenTraceFile = "src/main/data/2jobs2min-rumen-jh.json";
+    String synthTraceFile = "src/test/resources/syn.json";
+    String nodeFile = "src/test/resources/nodes.json";
+
+    // Test with both schedulers, and all three load producers.
+    return Arrays.asList(new Object[][] {
 
-    // start the simulator
-    File slsOutputDir = new File(tempDir.getAbsolutePath() + "/slsoutput/");
-    String args[] = new String[]{
-            "-inputrumen", "src/main/data/2jobs2min-rumen-jh.json",
-            "-output", slsOutputDir.getAbsolutePath()};
-    SLSRunner.main(args);
+        // covering old commandline in tests
+        {capScheduler, "OLD_RUMEN", rumenTraceFile, nodeFile },
+        {capScheduler, "OLD_SLS", slsTraceFile, nodeFile },
 
-    // wait for 20 seconds before stop
-    int count = 20;
-    while (count >= 0) {
-      Thread.sleep(1000);
+        // covering the no nodeFile case
+        {capScheduler, "SYNTH", synthTraceFile, null },
+        {capScheduler, "RUMEN", rumenTraceFile, null },
+        {capScheduler, "SLS", slsTraceFile, null },
 
-      if (! exceptionList.isEmpty()) {
-        SLSRunner.getRunner().stop();
-        Assert.fail("TestSLSRunner catched exception from child thread " +
-            "(TaskRunner.Task): " + exceptionList.get(0).getMessage());
-        break;
-      }
-      count--;
-    }
+        // covering new commandline and CapacityScheduler
+        {capScheduler, "SYNTH", synthTraceFile, nodeFile },
+        {capScheduler, "RUMEN", rumenTraceFile, nodeFile },
+        {capScheduler, "SLS", slsTraceFile, nodeFile },
 
-    SLSRunner.getRunner().stop();
+        // covering FairScheduler
+        {fairScheduler, "SYNTH", synthTraceFile, nodeFile },
+        {fairScheduler, "RUMEN", rumenTraceFile, nodeFile },
+        {fairScheduler, "SLS", slsTraceFile, nodeFile }
+    });
+  }
+
+  @Before
+  public void setup() {
+    ongoingInvariantFile = "src/test/resources/ongoing-invariants.txt";
+    exitInvariantFile = "src/test/resources/exit-invariants.txt";
+  }
+
+  @Test(timeout = 120000)
+  @SuppressWarnings("all")
+  public void testSimulatorRunning() throws Exception {
+    Configuration conf = new Configuration(false);
+    long timeTillShutdownInsec = 20L;
+    runSLS(conf, timeTillShutdownInsec);
   }
 
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java
new file mode 100644
index 0000000..2b1971a
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/TestSynthJobGeneration.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.sls;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.tools.rumen.TaskAttemptInfo;
+import org.apache.hadoop.yarn.sls.synthetic.SynthJob;
+import org.apache.hadoop.yarn.sls.synthetic.SynthTraceJobProducer;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.log4j.Logger;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Simple test class driving the {@code SynthTraceJobProducer}, and validating
+ * jobs produce are within expected range.
+ */
+public class TestSynthJobGeneration {
+
+  public final static Logger LOG =
+      Logger.getLogger(TestSynthJobGeneration.class);
+
+  @Test
+  public void test() throws IllegalArgumentException, IOException {
+
+    Configuration conf = new Configuration();
+
+    conf.set(SynthTraceJobProducer.SLS_SYNTHETIC_TRACE_FILE,
+        "src/test/resources/syn.json");
+
+    SynthTraceJobProducer stjp = new SynthTraceJobProducer(conf);
+
+    SynthJob js = (SynthJob) stjp.getNextJob();
+
+    int jobCount = 0;
+
+    while (js != null) {
+      LOG.info((jobCount++) + " " + js.getQueueName() + " -- "
+          + js.getJobClass().getClassName() + " (conf: "
+          + js.getJobConf().get(MRJobConfig.QUEUE_NAME) + ") " + " submission: "
+          + js.getSubmissionTime() + ", " + " duration: " + js.getDuration()
+          + " numMaps: " + js.getNumberMaps() + " numReduces: "
+          + js.getNumberReduces());
+
+      validateJob(js);
+      js = (SynthJob) stjp.getNextJob();
+    }
+
+    Assert.assertEquals(stjp.getNumJobs(), jobCount);
+  }
+
+  private void validateJob(SynthJob js) {
+
+    assertTrue(js.getSubmissionTime() > 0);
+    assertTrue(js.getDuration() > 0);
+    assertTrue(js.getNumberMaps() >= 0);
+    assertTrue(js.getNumberReduces() >= 0);
+    assertTrue(js.getNumberMaps() + js.getNumberReduces() > 0);
+    assertTrue(js.getTotalSlotTime() >= 0);
+
+    for (int i = 0; i < js.getNumberMaps(); i++) {
+      TaskAttemptInfo tai = js.getTaskAttemptInfo(TaskType.MAP, i, 0);
+      assertTrue(tai.getRuntime() > 0);
+    }
+
+    for (int i = 0; i < js.getNumberReduces(); i++) {
+      TaskAttemptInfo tai = js.getTaskAttemptInfo(TaskType.REDUCE, i, 0);
+      assertTrue(tai.getRuntime() > 0);
+    }
+
+    if (js.hasDeadline()) {
+      assertTrue(js.getDeadline() > js.getSubmissionTime() + js.getDuration());
+    }
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java
index fd1c861..02dc26e 100644
--- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/appmaster/TestAMSimulator.java
@@ -17,32 +17,62 @@
  */
 package org.apache.hadoop.yarn.sls.appmaster;
 
+import com.codahale.metrics.MetricRegistry;
+import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
 import org.apache.hadoop.yarn.sls.conf.SLSConfiguration;
-import org.apache.hadoop.yarn.sls.scheduler.ContainerSimulator;
+import org.apache.hadoop.yarn.sls.scheduler.*;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
 import java.util.List;
 
+@RunWith(Parameterized.class)
 public class TestAMSimulator {
   private ResourceManager rm;
   private YarnConfiguration conf;
+  private Path metricOutputDir;
+
+  private Class slsScheduler;
+  private Class scheduler;
+
+  @Parameterized.Parameters
+  public static Collection<Object[]> params() {
+    return Arrays.asList(new Object[][] {
+        {SLSFairScheduler.class, FairScheduler.class},
+        {SLSCapacityScheduler.class, CapacityScheduler.class}
+    });
+  }
+
+  public TestAMSimulator(Class slsScheduler, Class scheduler) {
+    this.slsScheduler = slsScheduler;
+    this.scheduler = scheduler;
+  }
 
   @Before
   public void setup() {
+    createMetricOutputDir();
+
     conf = new YarnConfiguration();
-    conf.set(YarnConfiguration.RM_SCHEDULER,
-        "org.apache.hadoop.yarn.sls.scheduler.ResourceSchedulerWrapper");
-    conf.set(SLSConfiguration.RM_SCHEDULER,
-        "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler");
-    conf.setBoolean(SLSConfiguration.METRICS_SWITCH, false);
+    conf.set(SLSConfiguration.METRICS_OUTPUT_DIR, metricOutputDir.toString());
+    conf.set(YarnConfiguration.RM_SCHEDULER, slsScheduler.getName());
+    conf.set(SLSConfiguration.RM_SCHEDULER, scheduler.getName());
+    conf.setBoolean(SLSConfiguration.METRICS_SWITCH, true);
     rm = new ResourceManager();
     rm.init(conf);
     rm.start();
@@ -64,14 +94,51 @@ public class TestAMSimulator {
     }
   }
 
+  private void verifySchedulerMetrics(String appId) {
+    if (scheduler.equals(FairScheduler.class)) {
+      SchedulerMetrics schedulerMetrics = ((SchedulerWrapper)
+          rm.getResourceScheduler()).getSchedulerMetrics();
+      MetricRegistry metricRegistry = schedulerMetrics.getMetrics();
+      for (FairSchedulerMetrics.Metric metric :
+          FairSchedulerMetrics.Metric.values()) {
+        String key = "variable.app." + appId + "." + metric.getValue() +
+            ".memory";
+        Assert.assertTrue(metricRegistry.getGauges().containsKey(key));
+        Assert.assertNotNull(metricRegistry.getGauges().get(key).getValue());
+      }
+    }
+  }
+
+  private void createMetricOutputDir() {
+    Path testDir = Paths.get(System.getProperty("test.build.data"));
+    try {
+      metricOutputDir = Files.createTempDirectory(testDir, "output");
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  private void deleteMetricOutputDir() {
+    try {
+      FileUtils.deleteDirectory(metricOutputDir.toFile());
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+    }
+  }
+
   @Test
   public void testAMSimulator() throws Exception {
     // Register one app
     MockAMSimulator app = new MockAMSimulator();
-    List<ContainerSimulator> containers = new ArrayList<ContainerSimulator>();
-    app.init(1, 1000, containers, rm, null, 0, 1000000l, "user1", "default",
-        false, "app1");
+    String appId = "app1";
+    String queue = "default";
+    List<ContainerSimulator> containers = new ArrayList<>();
+    app.init(1000, containers, rm, null, 0, 1000000L, "user1", queue, true,
+        appId, null, 0, SLSConfiguration.getAMContainerResource(conf));
     app.firstStep();
+
+    verifySchedulerMetrics(appId);
+
     Assert.assertEquals(1, rm.getRMContext().getRMApps().size());
     Assert.assertNotNull(rm.getRMContext().getRMApps().get(app.appId));
 
@@ -82,5 +149,7 @@ public class TestAMSimulator {
   @After
   public void tearDown() {
     rm.stop();
+
+    deleteMetricOutputDir();
   }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java
index f9a3932..2f10f7d 100644
--- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/nodemanager/TestNMSimulator.java
@@ -21,26 +21,50 @@ import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.sls.conf.SLSConfiguration;
+import org.apache.hadoop.yarn.sls.scheduler.SLSCapacityScheduler;
+import org.apache.hadoop.yarn.sls.scheduler.SLSFairScheduler;
 import org.apache.hadoop.yarn.util.resource.Resources;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 
+import java.util.Arrays;
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
 public class TestNMSimulator {
   private final int GB = 1024;
   private ResourceManager rm;
   private YarnConfiguration conf;
 
+  private Class slsScheduler;
+  private Class scheduler;
+
+  @Parameterized.Parameters
+  public static Collection<Object[]> params() {
+    return Arrays.asList(new Object[][] {
+        {SLSFairScheduler.class, FairScheduler.class},
+        {SLSCapacityScheduler.class, CapacityScheduler.class}
+    });
+  }
+
+  public TestNMSimulator(Class slsScheduler, Class scheduler) {
+    this.slsScheduler = slsScheduler;
+    this.scheduler = scheduler;
+  }
+
   @Before
   public void setup() {
     conf = new YarnConfiguration();
-    conf.set(YarnConfiguration.RM_SCHEDULER,
-        "org.apache.hadoop.yarn.sls.scheduler.ResourceSchedulerWrapper");
-    conf.set(SLSConfiguration.RM_SCHEDULER,
-        "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler");
+    conf.set(YarnConfiguration.RM_SCHEDULER, slsScheduler.getName());
+    conf.set(SLSConfiguration.RM_SCHEDULER, scheduler.getName());
     conf.setBoolean(SLSConfiguration.METRICS_SWITCH, false);
     rm = new ResourceManager();
     rm.init(conf);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java
index 23f2bb6..ce6c1b3 100644
--- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/scheduler/TestTaskRunner.java
@@ -35,7 +35,7 @@ public class TestTaskRunner {
   }
 
   @After
-  public void cleanUp() {
+  public void cleanUp() throws InterruptedException {
     runner.stop();
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java
index f4eda67..30964a1 100644
--- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/utils/TestSLSUtils.java
@@ -21,6 +21,9 @@ package org.apache.hadoop.yarn.sls.utils;
 import org.junit.Assert;
 import org.junit.Test;
 
+import java.util.HashSet;
+import java.util.Set;
+
 public class TestSLSUtils {
 
   @Test
@@ -36,4 +39,31 @@ public class TestSLSUtils {
     Assert.assertEquals(rackHostname[1], "node1");
   }
 
+  @Test
+  public void testGenerateNodes() {
+    Set<? extends String> nodes = SLSUtils.generateNodes(3, 3);
+    Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size());
+    Assert.assertEquals("Number of racks is wrong.", 3, getNumRack(nodes));
+
+    nodes = SLSUtils.generateNodes(3, 1);
+    Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size());
+    Assert.assertEquals("Number of racks is wrong.", 1, getNumRack(nodes));
+
+    nodes = SLSUtils.generateNodes(3, 4);
+    Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size());
+    Assert.assertEquals("Number of racks is wrong.", 3, getNumRack(nodes));
+
+    nodes = SLSUtils.generateNodes(3, 0);
+    Assert.assertEquals("Number of nodes is wrong.", 3, nodes.size());
+    Assert.assertEquals("Number of racks is wrong.", 1, getNumRack(nodes));
+  }
+
+  private int getNumRack(Set<? extends String> nodes) {
+    Set<String> racks = new HashSet<>();
+    for (String node : nodes) {
+      String[] rackHostname = SLSUtils.getRackHostName(node);
+      racks.add(rackHostname[0]);
+    }
+    return racks.size();
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java
index 1c1e63c..c9be450 100644
--- a/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java
+++ b/hadoop-tools/hadoop-sls/src/test/java/org/apache/hadoop/yarn/sls/web/TestSLSWebApp.java
@@ -20,7 +20,6 @@ package org.apache.hadoop.yarn.sls.web;
 
 import org.junit.Assert;
 import org.apache.commons.io.FileUtils;
-import org.apache.hadoop.yarn.sls.SLSRunner;
 import org.junit.Test;
 
 import java.io.File;
@@ -28,6 +27,7 @@ import java.text.MessageFormat;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
+import java.util.HashMap;
 
 public class TestSLSWebApp {
 
@@ -36,20 +36,21 @@ public class TestSLSWebApp {
     String simulateInfoTemplate = FileUtils.readFileToString(
             new File("src/main/html/simulate.info.html.template"));
 
-    SLSRunner.simulateInfoMap.put("Number of racks", 10);
-    SLSRunner.simulateInfoMap.put("Number of nodes", 100);
-    SLSRunner.simulateInfoMap.put("Node memory (MB)", 1024);
-    SLSRunner.simulateInfoMap.put("Node VCores", 1);
-    SLSRunner.simulateInfoMap.put("Number of applications", 100);
-    SLSRunner.simulateInfoMap.put("Number of tasks", 1000);
-    SLSRunner.simulateInfoMap.put("Average tasks per applicaion", 10);
-    SLSRunner.simulateInfoMap.put("Number of queues", 4);
-    SLSRunner.simulateInfoMap.put("Average applications per queue", 25);
-    SLSRunner.simulateInfoMap.put("Estimated simulate time (s)", 10000);
+    Map<String, Object> simulateInfoMap = new HashMap<>();
+    simulateInfoMap.put("Number of racks", 10);
+    simulateInfoMap.put("Number of nodes", 100);
+    simulateInfoMap.put("Node memory (MB)", 1024);
+    simulateInfoMap.put("Node VCores", 1);
+    simulateInfoMap.put("Number of applications", 100);
+    simulateInfoMap.put("Number of tasks", 1000);
+    simulateInfoMap.put("Average tasks per applicaion", 10);
+    simulateInfoMap.put("Number of queues", 4);
+    simulateInfoMap.put("Average applications per queue", 25);
+    simulateInfoMap.put("Estimated simulate time (s)", 10000);
 
     StringBuilder info = new StringBuilder();
     for (Map.Entry<String, Object> entry :
-            SLSRunner.simulateInfoMap.entrySet()) {
+        simulateInfoMap.entrySet()) {
       info.append("<tr>");
       info.append("<td class='td1'>" + entry.getKey() + "</td>");
       info.append("<td class='td2'>" + entry.getValue() + "</td>");
@@ -60,8 +61,7 @@ public class TestSLSWebApp {
             MessageFormat.format(simulateInfoTemplate, info.toString());
     Assert.assertTrue("The simulate info html page should not be empty",
             simulateInfo.length() > 0);
-    for (Map.Entry<String, Object> entry :
-            SLSRunner.simulateInfoMap.entrySet()) {
+    for (Map.Entry<String, Object> entry : simulateInfoMap.entrySet()) {
       Assert.assertTrue("The simulate info html page should have information "
               + "of " + entry.getKey(), simulateInfo.contains("<td class='td1'>"
               + entry.getKey() + "</td><td class='td2'>"

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml b/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml
index 61be96a..1762265 100644
--- a/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml
+++ b/hadoop-tools/hadoop-sls/src/test/resources/capacity-scheduler.xml
@@ -39,6 +39,16 @@
   </property>
 
   <property>
+    <name>yarn.scheduler.capacity.root.sls_queue_1.reservable</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.capacity.root.sls_queue_1.show-reservations-as-queues</name>
+    <value>true</value>
+  </property>
+
+  <property>
     <name>yarn.scheduler.capacity.root.sls_queue_2.capacity</name>
     <value>25</value>
   </property>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt b/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt
new file mode 100644
index 0000000..b4a3228
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/resources/exit-invariants.txt
@@ -0,0 +1,8 @@
+ActiveApplications >= 0
+AppsCompleted >= 0
+AppsFailed >= 0
+AppsKilled >= 0
+AppsPending >= 0
+AppsRunning >= 0
+AppsSubmitted >= 0
+PendingContainers >= 0

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml b/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml
index fa10359..7c46767 100644
--- a/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml
+++ b/hadoop-tools/hadoop-sls/src/test/resources/fair-scheduler.xml
@@ -21,6 +21,7 @@
 -->
 
 <allocations>
+  <defaultQueueSchedulingPolicy>drf</defaultQueueSchedulingPolicy>
   <user name="jenkins">
     <!-- Limit on running jobs for the user across all pools. If more
       jobs than this are submitted, only the first <maxRunningJobs> will
@@ -31,20 +32,21 @@
   <userMaxAppsDefault>1000</userMaxAppsDefault>
   <queue name="sls_queue_1">
     <minResources>1024 mb, 1 vcores</minResources>
-    <schedulingMode>fair</schedulingMode>
+    <schedulingPolicy>drf</schedulingPolicy>
     <weight>0.25</weight>
     <minSharePreemptionTimeout>2</minSharePreemptionTimeout>
+    <reservation>true</reservation>
   </queue>
   <queue name="sls_queue_2">
     <minResources>1024 mb, 1 vcores</minResources>
-    <schedulingMode>fair</schedulingMode>
+    <schedulingMode>drf</schedulingMode>
     <weight>0.25</weight>
     <minSharePreemptionTimeout>2</minSharePreemptionTimeout>
   </queue>
   <queue name="sls_queue_3">
     <minResources>1024 mb, 1 vcores</minResources>
     <weight>0.5</weight>
-    <schedulingMode>fair</schedulingMode>
+    <schedulingMode>drf</schedulingMode>
     <minSharePreemptionTimeout>2</minSharePreemptionTimeout>
   </queue>
 </allocations>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json b/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json
new file mode 100644
index 0000000..b9d46a5
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/resources/inputsls.json
@@ -0,0 +1,55 @@
+{
+  "am.type": "mapreduce",
+  "job.start.ms": 0,
+  "job.end.ms": 95375,
+  "job.queue.name": "sls_queue_1",
+  "job.id": "job_1",
+  "job.user": "default",
+  "job.tasks": [
+    {
+      "container.host": "/default-rack/node1",
+      "container.start.ms": 6664,
+      "container.end.ms": 23707,
+      "container.priority": 20,
+      "container.type": "map"
+    },
+    {
+      "container.host": "/default-rack/node3",
+      "container.start.ms": 6665,
+      "container.end.ms": 21593,
+      "container.priority": 20,
+      "container.type": "map"
+    },
+    {
+      "container.host": "/default-rack/node2",
+      "container.start.ms": 68770,
+      "container.end.ms": 86613,
+      "container.priority": 20,
+      "container.type": "map"
+    }
+  ]
+}
+{
+  "am.type": "mapreduce",
+  "job.start.ms": 105204,
+  "job.end.ms": 197256,
+  "job.queue.name": "sls_queue_2",
+  "job.id": "job_2",
+  "job.user": "default",
+  "job.tasks": [
+    {
+      "container.host": "/default-rack/node1",
+      "container.start.ms": 111822,
+      "container.end.ms": 133985,
+      "container.priority": 20,
+      "container.type": "map"
+    },
+    {
+      "container.host": "/default-rack/node2",
+      "container.start.ms": 111788,
+      "container.end.ms": 131377,
+      "container.priority": 20,
+      "container.type": "map"
+    }
+  ]
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties b/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties
new file mode 100644
index 0000000..81a3f6a
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/resources/log4j.properties
@@ -0,0 +1,19 @@
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+# log4j configuration used during build and unit tests
+
+log4j.rootLogger=info,stdout
+log4j.threshold=ALL
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p [%t] %c{2} (%F:%M(%L)) - %m%n

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/nodes.json
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/nodes.json b/hadoop-tools/hadoop-sls/src/test/resources/nodes.json
new file mode 100644
index 0000000..3039554
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/resources/nodes.json
@@ -0,0 +1,84 @@
+{
+  "rack": "rack1",
+  "nodes": [
+    {
+      "node": "node1"
+    },
+    {
+      "node": "node2"
+    },
+    {
+      "node": "node3"
+    },
+    {
+      "node": "node4"
+    }
+  ]
+}
+{
+  "rack": "rack2",
+  "nodes": [
+    {
+      "node": "node5"
+    },
+    {
+      "node": "node6"
+    },
+    {
+      "node": "node7"
+    },
+    {
+      "node": "node8"
+    }
+  ]
+}
+{
+  "rack": "rack3",
+  "nodes": [
+    {
+      "node": "node9"
+    },
+    {
+      "node": "node10"
+    },
+    {
+      "node": "node11"
+    },
+    {
+      "node": "node12"
+    }
+  ]
+}
+{
+  "rack": "rack4",
+  "nodes": [
+    {
+      "node": "node13"
+    },
+    {
+      "node": "node14"
+    },
+    {
+      "node": "node15"
+    },
+    {
+      "node": "node16"
+    }
+  ]
+}
+{
+  "rack": "rack5",
+  "nodes": [
+    {
+      "node": "node17"
+    },
+    {
+      "node": "node18"
+    },
+    {
+      "node": "node19"
+    },
+    {
+    }
+  ]
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt b/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt
new file mode 100644
index 0000000..363ed0d
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/resources/ongoing-invariants.txt
@@ -0,0 +1,54 @@
+running_0 >= 0
+running_60 >= 0
+running_300 >= 0
+running_1440 >= 0
+AppsSubmitted >= 0
+AppsRunning >= 0
+AppsPending >= 0
+AppsCompleted >= 0
+AppsKilled >= 0
+AppsFailed >= 0
+AllocatedMB >= 0
+AllocatedVCores >= 0
+AllocatedContainers >= 0
+AggregateContainersAllocated >= 0
+AggregateNodeLocalContainersAllocated >= 0
+AggregateRackLocalContainersAllocated >= 0
+AggregateOffSwitchContainersAllocated >= 0
+AggregateContainersReleased >= 0
+AggregateContainersPreempted >= 0
+AvailableMB >= 0
+AvailableVCores >= 0
+PendingMB >= 0
+PendingVCores >= 0
+PendingContainers >= 0
+ReservedMB >= 0
+ReservedVCores >= 0
+ReservedContainers >= 0
+ActiveUsers >= 0
+ActiveApplications >= 0
+AppAttemptFirstContainerAllocationDelayNumOps >= 0
+AppAttemptFirstContainerAllocationDelayAvgTime >= 0
+MemNonHeapUsedM >= 0
+MemNonHeapCommittedM >= 0
+MemNonHeapMaxM >= 0 || MemNonHeapMaxM == -1
+MemHeapUsedM >= 0
+MemHeapCommittedM >= 0
+MemHeapMaxM >= 0
+MemMaxM >= 0
+GcCountPS_Scavenge >= 0
+GcTimeMillisPS_Scavenge >= 0
+GcCountPS_MarkSweep >= 0
+GcTimeMillisPS_MarkSweep >= 0
+GcCount >= 0
+GcTimeMillis >= 0
+ThreadsNew >= 0
+ThreadsRunnable >= 0
+ThreadsBlocked >= 0
+ThreadsWaiting >= 0
+ThreadsTimedWaiting >= 0
+ThreadsTerminated >= 0
+LogFatal >= 0
+LogError >= 0
+LogWarn >= 0
+LogInfo >= 0

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml b/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
index d7acc98..2f076c2 100644
--- a/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
+++ b/hadoop-tools/hadoop-sls/src/test/resources/sls-runner.xml
@@ -25,11 +25,11 @@
   <!-- Nodes configuration -->
   <property>
     <name>yarn.sls.nm.memory.mb</name>
-    <value>10240</value>
+    <value>100240</value>
   </property>
   <property>
     <name>yarn.sls.nm.vcores</name>
-    <value>10</value>
+    <value>100</value>
   </property>
   <property>
     <name>yarn.sls.nm.heartbeat.interval.ms</name>
@@ -77,5 +77,5 @@
     <name>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</name>
     <value>org.apache.hadoop.yarn.sls.scheduler.CapacitySchedulerMetrics</value>
   </property>
-  
+
 </configuration>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/syn.json
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/syn.json b/hadoop-tools/hadoop-sls/src/test/resources/syn.json
new file mode 100644
index 0000000..8479d23
--- /dev/null
+++ b/hadoop-tools/hadoop-sls/src/test/resources/syn.json
@@ -0,0 +1,53 @@
+{
+  "description": "tiny jobs workload",
+  "num_nodes": 20,
+  "nodes_per_rack": 4,
+  "num_jobs": 10,
+  "rand_seed": 2,
+  "workloads": [
+    {
+      "workload_name": "tiny-test",
+      "workload_weight": 0.5,
+      "description": "Sort jobs",
+      "queue_name": "sls_queue_1",
+      "job_classes": [
+        {
+          "class_name": "class_1",
+          "user_name": "foobar",
+          "class_weight": 1.0,
+          "mtasks_avg": 5,
+          "mtasks_stddev": 1,
+          "rtasks_avg": 5,
+          "rtasks_stddev": 1,
+          "dur_avg": 60,
+          "dur_stddev": 5,
+          "mtime_avg": 10,
+          "mtime_stddev": 2,
+          "rtime_avg": 20,
+          "rtime_stddev": 4,
+          "map_max_memory_avg": 1024,
+          "map_max_memory_stddev": 0.001,
+          "reduce_max_memory_avg": 2048,
+          "reduce_max_memory_stddev": 0.001,
+          "map_max_vcores_avg": 1,
+          "map_max_vcores_stddev": 0.001,
+          "reduce_max_vcores_avg": 2,
+          "reduce_max_vcores_stddev": 0.001,
+          "chance_of_reservation": 0.5,
+          "deadline_factor_avg": 10.0,
+          "deadline_factor_stddev": 0.001
+        }
+      ],
+      "time_distribution": [
+        {
+          "time": 1,
+          "weight": 100
+        },
+        {
+          "time": 60,
+          "jobs": 0
+        }
+      ]
+    }
+  ]
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml b/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
index c9f714c..282aef3 100644
--- a/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
+++ b/hadoop-tools/hadoop-sls/src/test/resources/yarn-site.xml
@@ -17,7 +17,7 @@
 <configuration>
   <property>
     <name>yarn.resourcemanager.scheduler.class</name>
-	  <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
+    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
     <!-- <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value> -->
   </property>
 
@@ -69,4 +69,21 @@
     <name>yarn.scheduler.fair.assignmultiple</name>
     <value>true</value>
   </property>
+
+
+  <property>
+    <description>Enable reservation system.</description>
+    <name>yarn.resourcemanager.reservation-system.enable</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>1000000</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.cpu-vcores</name>
+    <value>320</value>
+  </property>
+
 </configuration>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
index 85df2c0..7061887 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml
@@ -340,6 +340,7 @@
             <exclude>src/test/resources/submit-reservation.json</exclude>
             <exclude>src/test/resources/delete-reservation.json</exclude>
             <exclude>src/test/resources/update-reservation.json</exclude>
+            <exclude>src/test/resources/invariants.txt</exclude>
           </excludes>
         </configuration>
       </plugin>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java
new file mode 100644
index 0000000..0491756
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+
+/**
+ * This exception represents the violation of an internal invariant.
+ */
+public class InvariantViolationException extends YarnRuntimeException {
+
+  public InvariantViolationException(String s) {
+    super(s);
+  }
+
+  public InvariantViolationException(String s, Exception e) {
+    super(s, e);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java
new file mode 100644
index 0000000..2c9031f
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Abstract invariant checker, that setup common context for invariants
+ * checkers.
+ */
+public abstract class InvariantsChecker implements SchedulingEditPolicy {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(InvariantsChecker.class);
+  public static final String THROW_ON_VIOLATION =
+      "yarn.resourcemanager.invariant-checker.throw-on-violation";
+  public static final String INVARIANT_MONITOR_INTERVAL =
+      "yarn.resourcemanager.invariant-checker.monitor-interval";
+
+  private Configuration conf;
+  private RMContext context;
+  private ResourceScheduler scheduler;
+  private boolean throwOnInvariantViolation;
+  private long monitoringInterval;
+
+  @Override
+  public void init(Configuration config, RMContext rmContext,
+      ResourceScheduler scheduler) {
+    this.conf = config;
+    this.context = rmContext;
+    this.scheduler = scheduler;
+    this.throwOnInvariantViolation =
+        conf.getBoolean(InvariantsChecker.THROW_ON_VIOLATION, false);
+    this.monitoringInterval =
+        conf.getLong(InvariantsChecker.INVARIANT_MONITOR_INTERVAL, 1000L);
+
+    LOG.info("Invariant checker " + this.getPolicyName()
+        + " enabled. Monitoring every " + monitoringInterval
+        + "ms, throwOnViolation=" + throwOnInvariantViolation);
+  }
+
+  @Override
+  public long getMonitoringInterval() {
+    return monitoringInterval;
+  }
+
+  @Override
+  public String getPolicyName() {
+    return this.getClass().getSimpleName();
+  }
+
+  public void logOrThrow(String message) throws InvariantViolationException {
+    if (getThrowOnInvariantViolation()) {
+      throw new InvariantViolationException(message);
+    } else {
+      LOG.warn(message);
+    }
+  }
+
+  public boolean getThrowOnInvariantViolation() {
+    return throwOnInvariantViolation;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public RMContext getContext() {
+    return context;
+  }
+
+  public ResourceScheduler getScheduler() {
+    return scheduler;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java
new file mode 100644
index 0000000..849cbf9
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java
@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.AbstractMetric;
+import org.apache.hadoop.metrics2.MetricsRecord;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.source.JvmMetrics;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.script.Compilable;
+import javax.script.CompiledScript;
+import javax.script.ScriptEngineManager;
+import javax.script.ScriptException;
+import javax.script.SimpleBindings;
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This policy checks at every invocation that a given set of invariants
+ * (specified in a file) are respected over QueueMetrics and JvmMetrics. The
+ * file may contain arbitrary (Javascrip) boolean expression over the metrics
+ * variables.
+ *
+ * The right set of invariants depends on the deployment environment, a large
+ * number of complex invariant can make this check expensive.
+ *
+ * The MetricsInvariantChecker can be configured to throw a RuntimeException or
+ * simlpy warn in the logs if an invariant is not respected.
+ */
+public class MetricsInvariantChecker extends InvariantsChecker {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(MetricsInvariantChecker.class);
+  public static final String INVARIANTS_FILE =
+      "yarn.resourcemanager.invariant-checker.file";
+
+  private MetricsSystem metricsSystem;
+  private MetricsCollectorImpl collector;
+  private SimpleBindings bindings;
+  private ScriptEngineManager manager;
+  private Compilable scriptEngine;
+  private String invariantFile;
+  private Map<String, CompiledScript> invariants;
+  private CompiledScript combinedInvariants;
+
+  // set of metrics we monitor
+  private QueueMetrics queueMetrics;
+  private JvmMetrics jvmMetrics;
+
+  @Override
+  public void init(Configuration config, RMContext rmContext,
+                   ResourceScheduler preemptableResourceScheduler) {
+
+    super.init(config, rmContext, preemptableResourceScheduler);
+
+    this.metricsSystem = DefaultMetricsSystem.instance();
+    this.queueMetrics =
+        QueueMetrics.forQueue(metricsSystem, "root", null, false, getConf());
+    this.jvmMetrics = (JvmMetrics) metricsSystem.getSource("JvmMetrics");
+
+    // at first collect all metrics
+    collector = new MetricsCollectorImpl();
+    queueMetrics.getMetrics(collector, true);
+    jvmMetrics.getMetrics(collector, true);
+
+    // prepare bindings and evaluation engine
+    this.bindings = new SimpleBindings();
+    this.manager = new ScriptEngineManager();
+    this.scriptEngine = (Compilable) manager.getEngineByName("JavaScript");
+
+    // load metrics invariant from file
+    this.invariantFile = getConf().get(MetricsInvariantChecker.INVARIANTS_FILE);
+
+    this.invariants = new HashMap<>();
+
+    // preload all bindings
+    queueMetrics.getMetrics(collector, true);
+    jvmMetrics.getMetrics(collector, true);
+    for (MetricsRecord record : collector.getRecords()) {
+      for (AbstractMetric am : record.metrics()) {
+        bindings.put(am.name().replace(' ', '_'), am.value());
+      }
+    }
+
+    StringBuilder sb = new StringBuilder();
+    try {
+      List<String> tempInv =
+          Files.readLines(new File(invariantFile), Charsets.UTF_8);
+
+
+      boolean first = true;
+      // precompile individual invariants
+      for (String inv : tempInv) {
+
+        if(first) {
+          first = false;
+        } else {
+          sb.append("&&");
+        }
+
+        invariants.put(inv, scriptEngine.compile(inv));
+        sb.append(" (");
+        sb.append(inv);
+        sb.append(") ");
+      }
+
+      // create a single large combined invariant for speed of checking
+      combinedInvariants = scriptEngine.compile(sb.toString());
+
+    } catch (IOException e) {
+      throw new RuntimeException(
+          "Error loading invariant file: " + e.getMessage());
+    } catch (ScriptException e) {
+      throw new RuntimeException("Error compiling invariant " + e.getMessage());
+    }
+
+  }
+
+  @Override
+  public void editSchedule() {
+    // grab all changed metrics and update bindings
+    collector.clear();
+    queueMetrics.getMetrics(collector, false);
+    jvmMetrics.getMetrics(collector, false);
+
+    for (MetricsRecord record : collector.getRecords()) {
+      for (AbstractMetric am : record.metrics()) {
+        bindings.put(am.name().replace(' ', '_'), am.value());
+      }
+    }
+
+    // evaluate all invariants with new bindings
+    try {
+
+      // fastpath check all invariants at once (much faster)
+      boolean allInvHold = (boolean) combinedInvariants.eval(bindings);
+
+      // if any fails, check individually to produce more insightful log
+      if (!allInvHold) {
+        for (Map.Entry<String, CompiledScript> e : invariants.entrySet()) {
+          boolean invariantsHold = (boolean) e.getValue().eval(bindings);
+          if (!invariantsHold) {
+            // filter bindings to produce minimal set
+            Map<String, Object> matchingBindings =
+                extractMatchingBindings(e.getKey(), bindings);
+            logOrThrow("Invariant \"" + e.getKey()
+                + "\" is NOT holding, with bindings: " + matchingBindings);
+          }
+        }
+      }
+    } catch (ScriptException e) {
+      logOrThrow(e.getMessage());
+    }
+  }
+
+  private static Map<String, Object> extractMatchingBindings(String inv,
+      SimpleBindings allBindings) {
+    Map<String, Object> matchingBindings = new HashMap<>();
+    for (Map.Entry<String, Object> s : allBindings.entrySet()) {
+      if (inv.contains(s.getKey())) {
+        matchingBindings.put(s.getKey(), s.getValue());
+      }
+    }
+    return matchingBindings;
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java
new file mode 100644
index 0000000..2f9f03e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/ReservationInvariantsChecker.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import org.apache.hadoop.yarn.server.resourcemanager.reservation.Plan;
+import org.apache.hadoop.yarn.util.UTCClock;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collection;
+
+/**
+ * Invariant checker that checks certain reservation invariants are respected.
+ */
+public class ReservationInvariantsChecker extends InvariantsChecker {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ReservationInvariantsChecker.class);
+
+  private UTCClock clock = new UTCClock();
+
+  @Override
+  public void editSchedule() {
+    Collection<Plan> plans =
+        getContext().getReservationSystem().getAllPlans().values();
+
+    try {
+      for (Plan plan : plans) {
+        long currReservations =
+            plan.getReservationsAtTime(clock.getTime()).size();
+        long numberReservationQueues = getContext().getScheduler()
+            .getQueueInfo(plan.getQueueName(), true, false).getChildQueues()
+            .size();
+        if (currReservations != numberReservationQueues - 1) {
+          logOrThrow("Number of reservations (" + currReservations
+              + ") does NOT match the number of reservationQueues ("
+              + (numberReservationQueues - 1) + "), while it should.");
+        }
+      }
+    } catch (IOException io) {
+      throw new InvariantViolationException("Issue during invariant check: ",
+          io);
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/d894f910/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java
new file mode 100644
index 0000000..d9931d6
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Monitoring policies, used to check invariants.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message