hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vino...@apache.org
Subject git commit: YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence of races between the launch and the stop-container call and when root processes crash. Contributed by Billie Rinaldi.
Date Tue, 04 Nov 2014 00:41:01 GMT
Repository: hadoop
Updated Branches:
  refs/heads/branch-2 d2d2c84a1 -> 9c76dcada


YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence of races between
the launch and the stop-container call and when root processes crash. Contributed by Billie
Rinaldi.

(cherry picked from commit c5a46d4c8ca236ff641a309f256bbbdf4dd56db5)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/9c76dcad
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/9c76dcad
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/9c76dcad

Branch: refs/heads/branch-2
Commit: 9c76dcadafb941d641b72586885ad98f8408c4e0
Parents: d2d2c84
Author: Vinod Kumar Vavilapalli <vinodkv@apache.org>
Authored: Mon Nov 3 16:38:55 2014 -0800
Committer: Vinod Kumar Vavilapalli <vinodkv@apache.org>
Committed: Mon Nov 3 16:40:37 2014 -0800

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |   4 +
 .../launcher/ContainerLaunch.java               |   5 +-
 .../launcher/TestContainerLaunch.java           | 111 +++++++++++++++++++
 3 files changed, 117 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/9c76dcad/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 3534a0d..a50da38 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -822,6 +822,10 @@ Release 2.6.0 - UNRELEASED
     YARN-2795. Fixed ResourceManager to not crash loading node-label data from
     HDFS in secure mode. (Wangda Tan via vinodkv)
 
+    YARN-1922. Fixed NodeManager to kill process-trees correctly in the presence 
+    of races between the launch and the stop-container call and when root
+    processes crash. (Billie Rinaldi via vinodkv)
+
 Release 2.5.1 - 2014-09-05
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/9c76dcad/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
index 434cb4e..57e3bb9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
@@ -461,9 +461,8 @@ public class ContainerLaunch implements Callable<Integer> {
     final int sleepInterval = 100;
 
     // loop waiting for pid file to show up 
-    // until either the completed flag is set which means something bad 
-    // happened or our timer expires in which case we admit defeat
-    while (!completed.get()) {
+    // until our timer expires in which case we admit defeat
+    while (true) {
       processId = ProcessIdFileReader.getProcessId(pidFilePath);
       if (processId != null) {
         LOG.debug("Got pid " + processId + " for container "

http://git-wip-us.apache.org/repos/asf/hadoop/blob/9c76dcad/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
index 186788d..001643b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
@@ -75,6 +75,7 @@ import org.apache.hadoop.yarn.event.Event;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
 import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -969,4 +970,114 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
       assertThat(e.getMessage(), containsString(expectedMessage));
     }
   }
+
+  @Test
+  public void testKillProcessGroup() throws Exception {
+    Assume.assumeTrue(Shell.isSetsidAvailable);
+    containerManager.start();
+
+    // Construct the Container-id
+    ApplicationId appId = ApplicationId.newInstance(2, 2);
+    ApplicationAttemptId appAttemptId =
+        ApplicationAttemptId.newInstance(appId, 1);
+    ContainerId cId = ContainerId.newInstance(appAttemptId, 0);
+    File processStartFile =
+        new File(tmpDir, "pid.txt").getAbsoluteFile();
+    File childProcessStartFile =
+        new File(tmpDir, "child_pid.txt").getAbsoluteFile();
+
+    // setup a script that can handle sigterm gracefully
+    File scriptFile = Shell.appendScriptExtension(tmpDir, "testscript");
+    PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile));
+    writer.println("#!/bin/bash\n\n");
+    writer.println("echo \"Running testscript for forked process\"");
+    writer.println("umask 0");
+    writer.println("echo $$ >> " + processStartFile);
+    writer.println("while true;\ndo sleep 1s;\ndone > /dev/null 2>&1 &");
+    writer.println("echo $! >> " + childProcessStartFile);
+    writer.println("while true;\ndo sleep 1s;\ndone");
+    writer.close();
+    FileUtil.setExecutable(scriptFile, true);
+
+    ContainerLaunchContext containerLaunchContext =
+        recordFactory.newRecordInstance(ContainerLaunchContext.class);
+
+    // upload the script file so that the container can run it
+    URL resource_alpha =
+        ConverterUtils.getYarnUrlFromPath(localFS
+            .makeQualified(new Path(scriptFile.getAbsolutePath())));
+    LocalResource rsrc_alpha =
+        recordFactory.newRecordInstance(LocalResource.class);
+    rsrc_alpha.setResource(resource_alpha);
+    rsrc_alpha.setSize(-1);
+    rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
+    rsrc_alpha.setType(LocalResourceType.FILE);
+    rsrc_alpha.setTimestamp(scriptFile.lastModified());
+    String destinationFile = "dest_file.sh";
+    Map<String, LocalResource> localResources =
+        new HashMap<String, LocalResource>();
+    localResources.put(destinationFile, rsrc_alpha);
+    containerLaunchContext.setLocalResources(localResources);
+
+    // set up the rest of the container
+    List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
+    containerLaunchContext.setCommands(commands);
+    Priority priority = Priority.newInstance(10);
+    long createTime = 1234;
+    Token containerToken = createContainerToken(cId, priority, createTime);
+
+    StartContainerRequest scRequest =
+        StartContainerRequest.newInstance(containerLaunchContext,
+            containerToken);
+    List<StartContainerRequest> list = new ArrayList<StartContainerRequest>();
+    list.add(scRequest);
+    StartContainersRequest allRequests =
+        StartContainersRequest.newInstance(list);
+    containerManager.startContainers(allRequests);
+
+    int timeoutSecs = 0;
+    while (!processStartFile.exists() && timeoutSecs++ < 20) {
+      Thread.sleep(1000);
+      LOG.info("Waiting for process start-file to be created");
+    }
+    Assert.assertTrue("ProcessStartFile doesn't exist!",
+        processStartFile.exists());
+
+    BufferedReader reader =
+          new BufferedReader(new FileReader(processStartFile));
+    // Get the pid of the process
+    String pid = reader.readLine().trim();
+    // No more lines
+    Assert.assertEquals(null, reader.readLine());
+    reader.close();
+
+    reader =
+          new BufferedReader(new FileReader(childProcessStartFile));
+    // Get the pid of the child process
+    String child = reader.readLine().trim();
+    // No more lines
+    Assert.assertEquals(null, reader.readLine());
+    reader.close();
+
+    LOG.info("Manually killing pid " + pid + ", but not child pid " + child);
+    Shell.execCommand(new String[]{"kill", "-9", pid});
+
+    BaseContainerManagerTest.waitForContainerState(containerManager, cId,
+        ContainerState.COMPLETE);
+
+    Assert.assertFalse("Process is still alive!",
+        DefaultContainerExecutor.containerIsAlive(pid));
+
+    List<ContainerId> containerIds = new ArrayList<ContainerId>();
+    containerIds.add(cId);
+
+    GetContainerStatusesRequest gcsRequest =
+        GetContainerStatusesRequest.newInstance(containerIds);
+
+    ContainerStatus containerStatus =
+        containerManager.getContainerStatuses(gcsRequest)
+            .getContainerStatuses().get(0);
+    Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(),
+        containerStatus.getExitStatus());
+  }
 }


Mime
View raw message