mesos-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bmah...@apache.org
Subject [2/3] mesos git commit: Added GPU_RESOURCES capability to FrameworkInfo.
Date Tue, 21 Jun 2016 01:49:31 GMT
Added GPU_RESOURCES capability to FrameworkInfo.

Due to the scarce resource problem described in MESOS-5377, we are
introducing a GPU_RESOURCES Framework capability. This capability
allows the Mesos allocator to make better decisions about which
frameworks should receive resources from GPU-capable machines. In
essence, the allocator will ONLY allocate resources from GPU-capable
machines to frameworks that have this capability. This is necessary to
prevent non-GPU workloads from filling up the GPU machines and
preventing GPU workloads from running.

Review: https://reviews.apache.org/r/48914/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/eec19286
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/eec19286
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/eec19286

Branch: refs/heads/master
Commit: eec19286f93e9c5b9f28447421a5721eb1605ac6
Parents: 9bf6460
Author: Kevin Klues <klueska@gmail.com>
Authored: Mon Jun 20 14:29:35 2016 -0700
Committer: Benjamin Mahler <bmahler@apache.org>
Committed: Mon Jun 20 18:43:51 2016 -0700

----------------------------------------------------------------------
 include/mesos/mesos.proto                       | 15 +++++++++++
 include/mesos/v1/mesos.proto                    | 15 +++++++++++
 src/master/allocator/mesos/hierarchical.cpp     | 26 +++++++++++++++++++-
 src/master/allocator/mesos/hierarchical.hpp     |  4 +++
 .../containerizer/nvidia_gpu_isolator_tests.cpp | 12 +++++++--
 5 files changed, 69 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/eec19286/include/mesos/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index 54e3d1e..e4bb313 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -274,6 +274,21 @@ message FrameworkInfo {
       // killed by an executor. The executor will examine this
       // capability to determine whether it can send TASK_KILLING.
       TASK_KILLING_STATE = 2;
+
+      // Indicates whether the framework is aware of GPU resources.
+      // Frameworks that are aware of GPU resources are expected to
+      // avoid placing non-GPU workloads on GPU agents, in order
+      // to avoid occupying a GPU agent and preventing GPU workloads
+      // from using precious GPU resources! Currently, if a
+      // framework is unaware of GPU resources, it will not be
+      // offered *any* of the resources on an agent with GPUs.
+      // This restriction is in place because we do not have a
+      // revocation mechanism that ensures GPU workloads can evict
+      // GPU agent occupants if necessary.
+      //
+      // TODO(bmahler): As we add revocation we can relax the
+      // restriction here. See MESOS-5634 for more information.
+      GPU_RESOURCES = 3;
     }
 
     // Enum fields should be optional, see: MESOS-4997.

http://git-wip-us.apache.org/repos/asf/mesos/blob/eec19286/include/mesos/v1/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index e644ce2..8bba833 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -274,6 +274,21 @@ message FrameworkInfo {
       // killed by an executor. The executor will examine this
       // capability to determine whether it can send TASK_KILLING.
       TASK_KILLING_STATE = 2;
+
+      // Indicates whether the framework is aware of GPU resources.
+      // Frameworks that are aware of GPU resources are expected to
+      // avoid placing non-GPU workloads on GPU agents, in order
+      // to avoid occupying a GPU agent and preventing GPU workloads
+      // from using precious GPU resources! Currently, if a
+      // framework is unaware of GPU resources, it will not be
+      // offered *any* of the resources on an agent with GPUs.
+      // This restriction is in place because we do not have a
+      // revocation mechanism that ensures GPU workloads can evict
+      // GPU agent occupants if necessary.
+      //
+      // TODO(bmahler): As we add revocation we can relax the
+      // restriction here. See MESOS-5634 for more information.
+      GPU_RESOURCES = 3;
     }
 
     // Enum fields should be optional, see: MESOS-4997.

http://git-wip-us.apache.org/repos/asf/mesos/blob/eec19286/src/master/allocator/mesos/hierarchical.cpp
----------------------------------------------------------------------
diff --git a/src/master/allocator/mesos/hierarchical.cpp b/src/master/allocator/mesos/hierarchical.cpp
index 8b7b3af..5b2331b 100644
--- a/src/master/allocator/mesos/hierarchical.cpp
+++ b/src/master/allocator/mesos/hierarchical.cpp
@@ -250,12 +250,16 @@ void HierarchicalAllocatorProcess::addFramework(
   frameworks[frameworkId] = Framework();
   frameworks[frameworkId].role = frameworkInfo.role();
 
-  // Check if the framework desires revocable resources.
+  // Set the framework capabilities that this allocator cares about.
   frameworks[frameworkId].revocable = false;
+  frameworks[frameworkId].gpuAware = false;
+
   foreach (const FrameworkInfo::Capability& capability,
            frameworkInfo.capabilities()) {
     if (capability.type() == FrameworkInfo::Capability::REVOCABLE_RESOURCES) {
       frameworks[frameworkId].revocable = true;
+    } else if (capability.type() == FrameworkInfo::Capability::GPU_RESOURCES) {
+      frameworks[frameworkId].gpuAware = true;
     }
   }
 
@@ -388,12 +392,16 @@ void HierarchicalAllocatorProcess::updateFramework(
   // progress on allowing these fields to be updated.
   CHECK_EQ(frameworks[frameworkId].role, frameworkInfo.role());
 
+  // Update the framework capabilities that this allocator cares about.
   frameworks[frameworkId].revocable = false;
+  frameworks[frameworkId].gpuAware = false;
 
   foreach (const FrameworkInfo::Capability& capability,
            frameworkInfo.capabilities()) {
     if (capability.type() == FrameworkInfo::Capability::REVOCABLE_RESOURCES) {
       frameworks[frameworkId].revocable = true;
+    } else if (capability.type() == FrameworkInfo::Capability::GPU_RESOURCES) {
+      frameworks[frameworkId].gpuAware = true;
     }
   }
 }
@@ -1263,6 +1271,14 @@ void HierarchicalAllocatorProcess::allocate(
           continue;
         }
 
+        // Only offer resources from slaves that have GPUs to
+        // frameworks that are capable of receiving GPUs.
+        // See MESOS-5634.
+        if (!frameworks[frameworkId].gpuAware &&
+            slaves[slaveId].total.gpus().getOrElse(0) > 0) {
+          continue;
+        }
+
         // Calculate the currently available resources on the slave.
         Resources available = slaves[slaveId].total - slaves[slaveId].allocated;
 
@@ -1393,6 +1409,14 @@ void HierarchicalAllocatorProcess::allocate(
           continue;
         }
 
+        // Only offer resources from slaves that have GPUs to
+        // frameworks that are capable of receiving GPUs.
+        // See MESOS-5634.
+        if (!frameworks[frameworkId].gpuAware &&
+            slaves[slaveId].total.gpus().getOrElse(0) > 0) {
+          continue;
+        }
+
         // Calculate the currently available resources on the slave.
         Resources available = slaves[slaveId].total - slaves[slaveId].allocated;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/eec19286/src/master/allocator/mesos/hierarchical.hpp
----------------------------------------------------------------------
diff --git a/src/master/allocator/mesos/hierarchical.hpp b/src/master/allocator/mesos/hierarchical.hpp
index 9c6b23a..c9cb1e1 100644
--- a/src/master/allocator/mesos/hierarchical.hpp
+++ b/src/master/allocator/mesos/hierarchical.hpp
@@ -283,6 +283,10 @@ protected:
     // Whether the framework desires revocable resources.
     bool revocable;
 
+    // Whether the framework is aware of GPU resources. See
+    // the documentation for the GPU_RESOURCES Capability.
+    bool gpuAware;
+
     // Active offer and inverse offer filters for the framework.
     hashmap<SlaveID, hashset<OfferFilter*>> offerFilters;
     hashmap<SlaveID, hashset<InverseOfferFilter*>> inverseOfferFilters;

http://git-wip-us.apache.org/repos/asf/mesos/blob/eec19286/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
index e06d107..dd49a93 100644
--- a/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
+++ b/src/tests/containerizer/nvidia_gpu_isolator_tests.cpp
@@ -91,8 +91,12 @@ TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_VerifyDeviceAccess)
 
   MockScheduler sched;
 
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::GPU_RESOURCES);
+
   MesosSchedulerDriver driver(
-      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
 
   Future<Nothing> schedRegistered;
   EXPECT_CALL(sched, registered(_, _, _))
@@ -183,8 +187,12 @@ TEST_F(NvidiaGpuTest, ROOT_CGROUPS_NVIDIA_GPU_FractionalResources)
 
   MockScheduler sched;
 
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.add_capabilities()->set_type(
+      FrameworkInfo::Capability::GPU_RESOURCES);
+
   MesosSchedulerDriver driver(
-      &sched, DEFAULT_FRAMEWORK_INFO, master.get()->pid, DEFAULT_CREDENTIAL);
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
 
   Future<Nothing> schedRegistered;
   EXPECT_CALL(sched, registered(_, _, _))


Mime
View raw message