mesos-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Benjamin Mahler <benjamin.mah...@gmail.com>
Subject Re: git commit: Task health status change notifications
Date Wed, 30 Jul 2014 01:36:30 GMT
Not sure if it's related to this commit, but seems the GracePeriod test is
flaky now:

https://issues.apache.org/jira/browse/MESOS-1653

Could you help triage this ticket?


On Tue, Jul 29, 2014 at 3:47 PM, <nnielsen@apache.org> wrote:

> Repository: mesos
> Updated Branches:
>   refs/heads/master 98557a7cf -> f66289831
>
>
> Task health status change notifications
>
> The reusable health check program added in #22579 emits TaskStatus
> messages when the task under supervision first becomes viable (when the
> task passes its first health check).  It also emits a message when a
> task changes state from healthy to unhealthy.
>
> However, the scheduler should be notified for _every_ observed change in
> health status.  It's easy to imagine cases where the scheduler wants to
> wait a while before killing an unhealthy task, but still be notified of
> status changes so that load balancers may be updated, etc.  This patch
> therefore causes the scheduler to also be notified when an unhealthy
> task becomes healthy again.
>
> Review: https://reviews.apache.org/r/23966
>
>
> Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
> Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f6628983
> Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f6628983
> Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f6628983
>
> Branch: refs/heads/master
> Commit: f6628983165e4b0a2f44bb288ff87041f9e5e1bb
> Parents: 98557a7
> Author: Connor Doyle <connor@mesosphere.io>
> Authored: Tue Jul 29 15:46:45 2014 -0700
> Committer: Niklas Q. Nielsen <niklas@mesosphere.io>
> Committed: Tue Jul 29 15:46:45 2014 -0700
>
> ----------------------------------------------------------------------
>  src/health-check/main.cpp        |  5 +-
>  src/tests/health_check_tests.cpp | 87 +++++++++++++++++++++++++++++++++++
>  2 files changed, 91 insertions(+), 1 deletion(-)
> ----------------------------------------------------------------------
>
>
>
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/health-check/main.cpp
> ----------------------------------------------------------------------
> diff --git a/src/health-check/main.cpp b/src/health-check/main.cpp
> index 95d881e..10d57a0 100644
> --- a/src/health-check/main.cpp
> +++ b/src/health-check/main.cpp
> @@ -121,7 +121,10 @@ private:
>    void success()
>    {
>      VLOG(1) << "Check passed";
> -    if (initializing) {
> +
> +    // Send a healthy status update on the first success,
> +    // and on the first success following failure(s).
> +    if (initializing || consecutiveFailures > 0) {
>        TaskHealthStatus taskHealthStatus;
>        taskHealthStatus.set_healthy(true);
>        taskHealthStatus.mutable_task_id()->CopyFrom(taskID);
>
>
> http://git-wip-us.apache.org/repos/asf/mesos/blob/f6628983/src/tests/health_check_tests.cpp
> ----------------------------------------------------------------------
> diff --git a/src/tests/health_check_tests.cpp
> b/src/tests/health_check_tests.cpp
> index aa5b78b..6c54ea8 100644
> --- a/src/tests/health_check_tests.cpp
> +++ b/src/tests/health_check_tests.cpp
> @@ -174,6 +174,93 @@ TEST_F(HealthCheckTest, HealthyTask)
>    Shutdown();
>  }
>
> +// Testing health status change reporting to scheduler.
> +TEST_F(HealthCheckTest, HealthStatusChange)
> +{
> +  Try<PID<Master> > master = StartMaster();
> +  ASSERT_SOME(master);
> +
> +  slave::Flags flags = CreateSlaveFlags();
> +  flags.isolation = "posix/cpu,posix/mem";
> +
> +  Try<MesosContainerizer*> containerizer =
> +    MesosContainerizer::create(flags, false);
> +  CHECK_SOME(containerizer);
> +
> +  Try<PID<Slave> > slave = StartSlave(containerizer.get());
> +  ASSERT_SOME(slave);
> +
> +  MockScheduler sched;
> +  MesosSchedulerDriver driver(
> +      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
> +
> +  EXPECT_CALL(sched, registered(&driver, _, _));
> +
> +  Future<vector<Offer> > offers;
> +  EXPECT_CALL(sched, resourceOffers(&driver, _))
> +    .WillOnce(FutureArg<1>(&offers))
> +    .WillRepeatedly(Return()); // Ignore subsequent offers.
> +
> +  driver.start();
> +
> +  AWAIT_READY(offers);
> +  EXPECT_NE(0u, offers.get().size());
> +
> +  // Create a temporary file.
> +  Try<string> temporaryPath = os::mktemp();
> +  ASSERT_SOME(temporaryPath);
> +  string tmpPath = temporaryPath.get();
> +
> +  // This command fails every other invocation.
> +  // For all runs i in Nat0, the following case i % 2 applies:
> +  //
> +  // Case 0:
> +  //   - Remove the temporary file.
> +  //
> +  // Case 1:
> +  //   - Attempt to remove the nonexistent temporary file.
> +  //   - Create the temporary file.
> +  //   - Exit with a non-zero status.
> +  string alt = "rm " + tmpPath + " || (touch " + tmpPath + " && exit 1)";
> +
> +  vector<TaskInfo> tasks = populateTasks(
> +      "sleep 20", alt, offers.get()[0], 0, 3);
> +
> +  Future<TaskStatus> statusRunning;
> +  Future<TaskStatus> statusHealth1;
> +  Future<TaskStatus> statusHealth2;
> +  Future<TaskStatus> statusHealth3;
> +
> +  EXPECT_CALL(sched, statusUpdate(&driver, _))
> +    .WillOnce(FutureArg<1>(&statusRunning))
> +    .WillOnce(FutureArg<1>(&statusHealth1))
> +    .WillOnce(FutureArg<1>(&statusHealth2))
> +    .WillOnce(FutureArg<1>(&statusHealth3));
> +
> +  driver.launchTasks(offers.get()[0].id(), tasks);
> +
> +  AWAIT_READY(statusRunning);
> +  EXPECT_EQ(TASK_RUNNING, statusRunning.get().state());
> +
> +  AWAIT_READY(statusHealth1);
> +  EXPECT_EQ(TASK_RUNNING, statusHealth1.get().state());
> +  EXPECT_TRUE(statusHealth1.get().healthy());
> +
> +  AWAIT_READY(statusHealth2);
> +  EXPECT_EQ(TASK_RUNNING, statusHealth2.get().state());
> +  EXPECT_FALSE(statusHealth2.get().healthy());
> +
> +  AWAIT_READY(statusHealth3);
> +  EXPECT_EQ(TASK_RUNNING, statusHealth3.get().state());
> +  EXPECT_TRUE(statusHealth3.get().healthy());
> +
> +  os::rm(tmpPath); // Clean up the temporary file.
> +
> +  driver.stop();
> +  driver.join();
> +
> +  Shutdown();
> +}
>
>  // Testing killing task after number of consecutive failures.
>  // Temporarily disabled due to MESOS-1613.
>
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message