mesos-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gilb...@apache.org
Subject [mesos] 01/04: Made agent not read the forked pid and libprocess pid after reboot.
Date Fri, 11 Jan 2019 21:34:23 GMT
This is an automated email from the ASF dual-hosted git repository.

gilbert pushed a commit to branch 1.5.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit efcc347bb86dfd0be469d8c0215af43435737c71
Author: Qian Zhang <zhq527725@gmail.com>
AuthorDate: Fri Jan 11 12:21:18 2019 -0800

    Made agent not read the forked pid and libprocess pid after reboot.
    
    After agent host is rebooted, the forked pid and libprocess pid in
    agent's meta directory are obsolete, so we should not read them during
    agent recovery, otherwise containerizer may wait for an irrelevant
    process if the forked pid is reused by another process after reboot.
    
    Review: https://reviews.apache.org/r/69705/
    (cherry picked from commit eda091215a132a7ea758a95cd4f66945a462387e)
---
 src/slave/state.cpp | 57 +++++++++++++++++++++++++++++++++++++++++------------
 src/slave/state.hpp | 12 +++++++----
 2 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/slave/state.cpp b/src/slave/state.cpp
index e7cf849..ae16d6f 100644
--- a/src/slave/state.cpp
+++ b/src/slave/state.cpp
@@ -123,7 +123,9 @@ Try<State> recover(const string& rootDir, bool strict)
   SlaveID slaveId;
   slaveId.set_value(Path(directory.get()).basename());
 
-  Try<SlaveState> slave = SlaveState::recover(rootDir, slaveId, strict);
+  Try<SlaveState> slave =
+    SlaveState::recover(rootDir, slaveId, strict, state.rebooted);
+
   if (slave.isError()) {
     return Error(slave.error());
   }
@@ -137,7 +139,8 @@ Try<State> recover(const string& rootDir, bool strict)
 Try<SlaveState> SlaveState::recover(
     const string& rootDir,
     const SlaveID& slaveId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   SlaveState state;
   state.id = slaveId;
@@ -188,7 +191,7 @@ Try<SlaveState> SlaveState::recover(
     frameworkId.set_value(Path(path).basename());
 
     Try<FrameworkState> framework =
-      FrameworkState::recover(rootDir, slaveId, frameworkId, strict);
+      FrameworkState::recover(rootDir, slaveId, frameworkId, strict, rebooted);
 
     if (framework.isError()) {
       return Error("Failed to recover framework " + frameworkId.value() +
@@ -207,7 +210,8 @@ Try<FrameworkState> FrameworkState::recover(
     const string& rootDir,
     const SlaveID& slaveId,
     const FrameworkID& frameworkId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   FrameworkState state;
   state.id = frameworkId;
@@ -295,8 +299,8 @@ Try<FrameworkState> FrameworkState::recover(
     ExecutorID executorId;
     executorId.set_value(Path(path).basename());
 
-    Try<ExecutorState> executor =
-      ExecutorState::recover(rootDir, slaveId, frameworkId, executorId, strict);
+    Try<ExecutorState> executor = ExecutorState::recover(
+        rootDir, slaveId, frameworkId, executorId, strict, rebooted);
 
     if (executor.isError()) {
       return Error("Failed to recover executor '" + executorId.value() +
@@ -316,7 +320,8 @@ Try<ExecutorState> ExecutorState::recover(
     const SlaveID& slaveId,
     const FrameworkID& frameworkId,
     const ExecutorID& executorId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   ExecutorState state;
   state.id = executorId;
@@ -356,7 +361,13 @@ Try<ExecutorState> ExecutorState::recover(
       containerId.set_value(Path(path).basename());
 
       Try<RunState> run = RunState::recover(
-          rootDir, slaveId, frameworkId, executorId, containerId, strict);
+          rootDir,
+          slaveId,
+          frameworkId,
+          executorId,
+          containerId,
+          strict,
+          rebooted);
 
       if (run.isError()) {
         return Error(
@@ -423,7 +434,8 @@ Try<RunState> RunState::recover(
     const FrameworkID& frameworkId,
     const ExecutorID& executorId,
     const ContainerID& containerId,
-    bool strict)
+    bool strict,
+    bool rebooted)
 {
   RunState state;
   state.id = containerId;
@@ -469,18 +481,37 @@ Try<RunState> RunState::recover(
     state.errors += task->errors;
   }
 
-  // Read the forked pid.
   path = paths::getForkedPidPath(
       rootDir, slaveId, frameworkId, executorId, containerId);
+
+  // If agent host is rebooted, we do not read the forked pid and libprocess pid
+  // since those two pids are obsolete after reboot. And we remove the forked
+  // pid file to make sure we will not read it in the case the agent process is
+  // restarted after we checkpoint the new boot ID in `Slave::__recover` (i.e.,
+  // agent recovery is done after the reboot).
+  if (rebooted) {
+    if (os::exists(path)) {
+      Try<Nothing> rm = os::rm(path);
+      if (rm.isError()) {
+        return Error(
+            "Failed to remove executor forked pid file '" + path + "': " +
+            rm.error());
+      }
+    }
+
+    return state;
+  }
+
   if (!os::exists(path)) {
-    // This could happen if the slave died before the isolator
-    // checkpointed the forked pid.
+    // This could happen if the slave died before the containerizer checkpointed
+    // the forked pid or agent process is restarted after agent host is rebooted
+    // since we remove this file in the above code.
     LOG(WARNING) << "Failed to find executor forked pid file '" << path <<
"'";
     return state;
   }
 
+  // Read the forked pid.
   Result<string> pid = state::read<string>(path);
-
   if (pid.isError()) {
     message = "Failed to read executor forked pid from '" + path +
               "': " + pid.error();
diff --git a/src/slave/state.hpp b/src/slave/state.hpp
index 003211e..37cd2f6 100644
--- a/src/slave/state.hpp
+++ b/src/slave/state.hpp
@@ -271,7 +271,8 @@ struct RunState
       const FrameworkID& frameworkId,
       const ExecutorID& executorId,
       const ContainerID& containerId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   Option<ContainerID> id;
   hashmap<TaskID, TaskState> tasks;
@@ -298,7 +299,8 @@ struct ExecutorState
       const SlaveID& slaveId,
       const FrameworkID& frameworkId,
       const ExecutorID& executorId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   ExecutorID id;
   Option<ExecutorInfo> info;
@@ -316,7 +318,8 @@ struct FrameworkState
       const std::string& rootDir,
       const SlaveID& slaveId,
       const FrameworkID& frameworkId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   FrameworkID id;
   Option<FrameworkInfo> info;
@@ -351,7 +354,8 @@ struct SlaveState
   static Try<SlaveState> recover(
       const std::string& rootDir,
       const SlaveID& slaveId,
-      bool strict);
+      bool strict,
+      bool rebooted);
 
   SlaveID id;
   Option<SlaveInfo> info;


Mime
View raw message