hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ka...@apache.org
Subject hadoop git commit: YARN-4866. FairScheduler: AMs can consume all vcores leading to a livelock when using FAIR policy. (Yufei Gu via kasha)
Date Thu, 26 May 2016 05:16:41 GMT
Repository: hadoop
Updated Branches:
  refs/heads/branch-2 c65697796 -> 999cdd91b


YARN-4866. FairScheduler: AMs can consume all vcores leading to a livelock when using FAIR
policy. (Yufei Gu via kasha)

(cherry picked from commit 4f513a4a8ed73beec89b7b28c0cd056ee5f4ed0d)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/999cdd91
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/999cdd91
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/999cdd91

Branch: refs/heads/branch-2
Commit: 999cdd91b751de76f0e8a8cbb63df41b985cec6c
Parents: c656977
Author: Karthik Kambatla <kasha@cloudera.com>
Authored: Wed May 25 22:13:27 2016 -0700
Committer: Karthik Kambatla <kasha@cloudera.com>
Committed: Wed May 25 22:13:42 2016 -0700

----------------------------------------------------------------------
 .../scheduler/fair/FSLeafQueue.java             | 23 ++++++++--
 .../resourcemanager/scheduler/fair/FSQueue.java | 19 +++++++++
 .../scheduler/fair/TestFairScheduler.java       | 44 +++++++++++++++++++-
 3 files changed, 81 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/999cdd91/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java
index ca5a146..a398906 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSLeafQueue.java
@@ -43,6 +43,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppUtils;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.policies.DominantResourceFairnessPolicy;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
 @Private
@@ -481,8 +482,8 @@ public class FSLeafQueue extends FSQueue {
 
   /**
    * Check whether this queue can run this application master under the
-   * maxAMShare limit
-   *
+   * maxAMShare limit. For FIFO and FAIR policies, check if the VCore usage
+   * takes up the entire cluster or maxResources for the queue.
    * @param amResource
    * @return true if this queue can run
    */
@@ -494,8 +495,22 @@ public class FSLeafQueue extends FSQueue {
     }
     Resource maxAMResource = Resources.multiply(getFairShare(), maxAMShare);
     Resource ifRunAMResource = Resources.add(amResourceUsage, amResource);
-    return !policy
-        .checkIfAMResourceUsageOverLimit(ifRunAMResource, maxAMResource);
+
+    boolean overMaxAMShareLimit = policy
+            .checkIfAMResourceUsageOverLimit(ifRunAMResource, maxAMResource);
+
+    // For fair policy and fifo policy which doesn't check VCore usages,
+    // additionally check if the AM takes all available VCores or
+    // over maxResource to avoid deadlock.
+    if (!overMaxAMShareLimit && !policy.equals(
+        SchedulingPolicy.getInstance(DominantResourceFairnessPolicy.class))) {
+      overMaxAMShareLimit =
+         isVCoresOverMaxResource(ifRunAMResource.getVirtualCores()) ||
+         ifRunAMResource.getVirtualCores() >=
+         scheduler.getRootQueueMetrics().getAvailableVirtualCores();
+    }
+
+    return !overMaxAMShareLimit;
   }
 
   public void addAMResourceUsage(Resource amResource) {

http://git-wip-us.apache.org/repos/asf/hadoop/blob/999cdd91/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java
index 4e6b99d..a0e9453 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSQueue.java
@@ -311,6 +311,25 @@ public abstract class FSQueue implements Queue, Schedulable {
   }
 
   /**
+   * Helper method to check if requested VCores are over maxResource.
+   * @param requestedVCores the number of VCores requested
+   * @return true if the number of VCores requested is over the maxResource;
+   *         false otherwise
+   */
+  protected boolean isVCoresOverMaxResource(int requestedVCores) {
+    if (requestedVCores >= scheduler.getAllocationConfiguration().
+        getMaxResources(getName()).getVirtualCores()) {
+      return true;
+    }
+
+    if (getParent() == null) {
+      return false;
+    }
+
+    return getParent().isVCoresOverMaxResource(requestedVCores);
+  }
+
+  /**
    * Returns true if queue has at least one app running.
    */
   public boolean isActive() {

http://git-wip-us.apache.org/repos/asf/hadoop/blob/999cdd91/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
index b30fabe..38801f2 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java
@@ -3287,6 +3287,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
   @Test
   public void testQueueMaxAMShareDefault() throws Exception {
     conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, ALLOC_FILE);
+    conf.setInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES, 6);
 
     PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE));
     out.println("<?xml version=\"1.0\"?>");
@@ -3297,11 +3298,14 @@ public class TestFairScheduler extends FairSchedulerTestBase {
     out.println("<maxAMShare>0.4</maxAMShare>");
     out.println("</queue>");
     out.println("<queue name=\"queue3\">");
+    out.println("<maxResources>10240 mb 4 vcores</maxResources>");
     out.println("</queue>");
     out.println("<queue name=\"queue4\">");
     out.println("</queue>");
     out.println("<queue name=\"queue5\">");
     out.println("</queue>");
+    out.println(
+        "<defaultQueueSchedulingPolicy>fair</defaultQueueSchedulingPolicy>");
     out.println("</allocations>");
     out.close();
 
@@ -3310,7 +3314,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
     scheduler.reinitialize(conf, resourceManager.getRMContext());
 
     RMNode node =
-        MockNodes.newNodeInfo(1, Resources.createResource(8192, 20),
+        MockNodes.newNodeInfo(1, Resources.createResource(8192, 10),
             0, "127.0.0.1");
     NodeAddedSchedulerEvent nodeEvent = new NodeAddedSchedulerEvent(node);
     NodeUpdateSchedulerEvent updateEvent = new NodeUpdateSchedulerEvent(node);
@@ -3378,6 +3382,44 @@ public class TestFairScheduler extends FairSchedulerTestBase {
         0, app2.getLiveContainers().size());
     assertEquals("Queue2's AM resource usage should be 0 MB memory",
         0, queue2.getAmResourceUsage().getMemory());
+
+    // Remove the app2
+    AppAttemptRemovedSchedulerEvent appRemovedEvent2 =
+        new AppAttemptRemovedSchedulerEvent(attId2,
+                RMAppAttemptState.FINISHED, false);
+    scheduler.handle(appRemovedEvent2);
+    scheduler.update();
+
+    // AM3 can pass the fair share checking, but it takes all available VCore,
+    // So the AM3 is not accepted.
+    ApplicationAttemptId attId3 = createAppAttemptId(3, 1);
+    createApplicationWithAMResource(attId3, "queue3", "test1", amResource1);
+    createSchedulingRequestExistingApplication(1024, 6, amPriority, attId3);
+    FSAppAttempt app3 = scheduler.getSchedulerApp(attId3);
+    scheduler.update();
+    scheduler.handle(updateEvent);
+    assertEquals("Application3's AM resource shouldn't be updated",
+            0, app3.getAMResource().getMemory());
+    assertEquals("Application3's AM should not be running",
+            0, app3.getLiveContainers().size());
+    assertEquals("Queue3's AM resource usage should be 0 MB memory",
+            0, queue3.getAmResourceUsage().getMemory());
+
+    // AM4 can pass the fair share checking and it doesn't takes all
+    // available VCore, but it need 5 VCores which are more than
+    // maxResources(4 VCores). So the AM4 is not accepted.
+    ApplicationAttemptId attId4 = createAppAttemptId(4, 1);
+    createApplicationWithAMResource(attId4, "queue3", "test1", amResource1);
+    createSchedulingRequestExistingApplication(1024, 5, amPriority, attId4);
+    FSAppAttempt app4 = scheduler.getSchedulerApp(attId4);
+    scheduler.update();
+    scheduler.handle(updateEvent);
+    assertEquals("Application4's AM resource shouldn't be updated",
+            0, app4.getAMResource().getMemory());
+    assertEquals("Application4's AM should not be running",
+            0, app4.getLiveContainers().size());
+    assertEquals("Queue3's AM resource usage should be 0 MB memory",
+            0, queue3.getAmResourceUsage().getMemory());
   }
 
   /**


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message