hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s..@apache.org
Subject [17/50] [abbrv] hadoop git commit: YARN-7189. Container-executor doesn't remove Docker containers that error out early. Contributed by Eric Badger
Date Wed, 02 May 2018 19:21:52 GMT
YARN-7189. Container-executor doesn't remove Docker containers that error out early. Contributed by Eric Badger

(cherry picked from commit 391ac5cdd2f31db2341bb731daee094b9ca309ec)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/5ec195ed
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/5ec195ed
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/5ec195ed

Branch: refs/heads/YARN-8200
Commit: 5ec195edbcd982a3e7c2a4ea760e3ce860c87143
Parents: 88cb461
Author: Jason Lowe <jlowe@apache.org>
Authored: Tue Apr 17 09:45:55 2018 -0500
Committer: Jason Lowe <jlowe@apache.org>
Committed: Tue Apr 17 09:53:19 2018 -0500

----------------------------------------------------------------------
 .../impl/container-executor.c                   | 59 +++++++++++++++-----
 1 file changed, 44 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/5ec195ed/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
index c1a42ca..109ff73 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
@@ -1444,7 +1444,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
   if (exit_code != 0) {
     fprintf(ERRORFILE, "Could not create script path\n");
     fflush(ERRORFILE);
-    goto cleanup;
+    goto pre_launch_cleanup;
   }
 
   fprintf(LOGFILE, "Creating local dirs...\n");
@@ -1455,7 +1455,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
   if (exit_code != 0) {
     fprintf(ERRORFILE, "Could not create local files and directories %d %d\n", container_file_source, cred_file_source);
     fflush(ERRORFILE);
-    goto cleanup;
+    goto pre_launch_cleanup;
   }
 
   docker_command = construct_docker_command(command_file);
@@ -1467,14 +1467,14 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
     exit_code = OUT_OF_MEMORY;
     fprintf(ERRORFILE, "Container out of memory");
     fflush(ERRORFILE);
-    goto cleanup;
+    goto pre_launch_cleanup;
   }
 
   fprintf(LOGFILE, "Changing effective user to root...\n");
   if (change_effective_user(0, user_gid) != 0) {
     fprintf(ERRORFILE, "Could not change to effective users %d, %d\n", 0, user_gid);
     fflush(ERRORFILE);
-    goto cleanup;
+    goto pre_launch_cleanup;
   }
 
   snprintf(docker_command_with_binary, command_size, "%s %s", docker_binary, docker_command);
@@ -1487,7 +1487,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
      "Could not invoke docker %s.\n", docker_command_with_binary);
     fflush(ERRORFILE);
     exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
-    goto cleanup;
+    goto post_launch_cleanup;
   }
 
   snprintf(docker_inspect_command, command_size,
@@ -1504,7 +1504,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
      "Could not inspect docker to get pid %s.\n", docker_inspect_command);
     fflush(ERRORFILE);
     exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
-    goto cleanup;
+    goto post_launch_cleanup;
   }
 
   if (pid != 0) {
@@ -1519,7 +1519,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
         if (strcmp(*cgroup_ptr, "none") != 0 &&
              write_pid_to_cgroup_as_root(*cgroup_ptr, pid) != 0) {
           exit_code = WRITE_CGROUP_FAILED;
-          goto cleanup;
+          goto post_launch_cleanup;
         }
       }
     }
@@ -1532,7 +1532,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
       exit_code = WRITE_PIDFILE_FAILED;
       fprintf(ERRORFILE, "Could not write pid to %s", pid_file);
       fflush(ERRORFILE);
-      goto cleanup;
+      goto post_launch_cleanup;
     }
 
     snprintf(docker_wait_command, command_size,
@@ -1578,20 +1578,49 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
     }
   }
 
+post_launch_cleanup:
+
   fprintf(LOGFILE, "Removing docker container post-exit...\n");
   snprintf(docker_rm_command, command_size,
     "%s rm %s", docker_binary, container_id);
-  FILE* rm_docker = popen(docker_rm_command, "w");
-  if (pclose (rm_docker) != 0)
-  {
-    fprintf (ERRORFILE,
-     "Could not remove container %s.\n", docker_rm_command);
+  int rc, i, sleep_time = 1, max_iterations = 5;
+  for (i = 0; i < max_iterations; i++) {
+    if (i > 0) {
+      sleep(sleep_time);
+      sleep_time *= 2;
+    }
+    FILE* rm_docker = popen(docker_rm_command, "w");
+    if (rm_docker == 0) {
+      fprintf(ERRORFILE,
+        "popen() failed: %s\n", strerror(errno));
+      fflush(ERRORFILE);
+      continue;
+    }
+    rc = pclose(rm_docker);
+    if (rc == -1) {
+      fprintf(ERRORFILE,
+        "pclose() failed: %s\n", strerror(errno));
+      fflush(ERRORFILE);
+    } else if (WIFEXITED(rc)) {
+      if (WEXITSTATUS(rc) == 0) {
+        break;
+      } else {
+        fprintf(ERRORFILE,
+          "docker rm command failed with exit status: %d\n", WEXITSTATUS(rc));
+        fflush(ERRORFILE);
+      }
+    }
+  }
+
+  if (i == max_iterations) {
+    // Tried 5 times and failed.
+    fprintf(ERRORFILE,
+     "Could not remove container after %d tries: %s\n", max_iterations, docker_rm_command);
     fflush(ERRORFILE);
     exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
-    goto cleanup;
   }
 
-cleanup:
+pre_launch_cleanup:
 
   if (exit_code_file != NULL && write_exit_code_file_as_nm(exit_code_file, exit_code) < 0) {
     fprintf (ERRORFILE,


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message