hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jl...@apache.org
Subject hadoop git commit: YARN-6640. AM heartbeat stuck when responseId overflows MAX_INT. Contributed by Botong Huang
Date Fri, 25 Aug 2017 15:05:09 GMT
Repository: hadoop
Updated Branches:
  refs/heads/branch-2 389b659c9 -> 82957ce0d


YARN-6640. AM heartbeat stuck when responseId overflows MAX_INT. Contributed by Botong Huang

(cherry picked from commit 3a4e861169dc3da9df0158ba6f44a9bc8576e217)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/82957ce0
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/82957ce0
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/82957ce0

Branch: refs/heads/branch-2
Commit: 82957ce0d0e3e4594e0dc2fa7508e09656bc4b8d
Parents: 389b659
Author: Jason Lowe <jlowe@apache.org>
Authored: Fri Aug 25 09:16:17 2017 -0500
Committer: Jason Lowe <jlowe@apache.org>
Committed: Fri Aug 25 09:20:50 2017 -0500

----------------------------------------------------------------------
 .../ApplicationMasterService.java               | 33 +++++++++++++----
 .../yarn/server/resourcemanager/MockAM.java     | 31 +++++++++++-----
 .../TestApplicationMasterService.java           | 37 +++++++++++++++++++-
 3 files changed, 86 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/82957ce0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
index 7c1e30b..147ba34 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
@@ -80,6 +80,8 @@ import com.google.common.annotations.VisibleForTesting;
 public class ApplicationMasterService extends AbstractService implements
     ApplicationMasterProtocol {
   private static final Log LOG = LogFactory.getLog(ApplicationMasterService.class);
+  private static final int PRE_REGISTER_RESPONSE_ID = -1;
+
   private final AMLivelinessMonitor amLivelinessMonitor;
   private YarnScheduler rScheduler;
   protected InetSocketAddress masterServiceAddress;
@@ -321,6 +323,11 @@ public class ApplicationMasterService extends AbstractService implements
   protected static final Allocation EMPTY_ALLOCATION = new Allocation(
       EMPTY_CONTAINER_LIST, Resources.createResource(0), null, null, null);
 
+  private int getNextResponseId(int responseId) {
+    // Loop between 0 to Integer.MAX_VALUE
+    return (responseId + 1) & Integer.MAX_VALUE;
+  }
+
   @Override
   public AllocateResponse allocate(AllocateRequest request)
       throws YarnException, IOException {
@@ -353,14 +360,17 @@ public class ApplicationMasterService extends AbstractService implements
         throw new ApplicationMasterNotRegisteredException(message);
       }
 
-      if ((request.getResponseId() + 1) == lastResponse.getResponseId()) {
-        /* old heartbeat */
+      // Normally request.getResponseId() == lastResponse.getResponseId()
+      if (getNextResponseId(request.getResponseId()) == lastResponse
+          .getResponseId()) {
+        // heartbeat one step old, simply return lastReponse
         return lastResponse;
-      } else if (request.getResponseId() + 1 < lastResponse.getResponseId()) {
+      } else if (request.getResponseId() != lastResponse.getResponseId()) {
         String message =
             "Invalid responseId in AllocateRequest from application attempt: "
                 + appAttemptId + ", expect responseId to be "
-                + (lastResponse.getResponseId() + 1);
+                + lastResponse.getResponseId() + ", but get "
+                + request.getResponseId();
         throw new InvalidApplicationMasterRequestException(message);
       }
 
@@ -400,7 +410,7 @@ public class ApplicationMasterService extends AbstractService implements
        * need to worry about unregister call occurring in between (which
        * removes the lock object).
        */
-      response.setResponseId(lastResponse.getResponseId() + 1);
+      response.setResponseId(getNextResponseId(lastResponse.getResponseId()));
       lock.setAllocateResponse(response);
       return response;
     }
@@ -411,12 +421,23 @@ public class ApplicationMasterService extends AbstractService implements
         recordFactory.newRecordInstance(AllocateResponse.class);
     // set response id to -1 before application master for the following
     // attemptID get registered
-    response.setResponseId(-1);
+    response.setResponseId(PRE_REGISTER_RESPONSE_ID);
     LOG.info("Registering app attempt : " + attemptId);
     responseMap.put(attemptId, new AllocateResponseLock(response));
     rmContext.getNMTokenSecretManager().registerApplicationAttempt(attemptId);
   }
 
+  @VisibleForTesting
+  protected boolean setAttemptLastResponseId(ApplicationAttemptId attemptId,
+      int lastResponseId) {
+    AllocateResponseLock lock = responseMap.get(attemptId);
+    if (lock == null || lock.getAllocateResponse() == null) {
+      return false;
+    }
+    lock.getAllocateResponse().setResponseId(lastResponseId);
+    return true;
+  }
+
   public void unregisterAttempt(ApplicationAttemptId attemptId) {
     LOG.info("Unregistering app attempt : " + attemptId);
     responseMap.remove(attemptId);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/82957ce0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
index 2451c1e..a698ecf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
@@ -126,6 +126,18 @@ public class MockAM {
     }
   }
 
+  public boolean setApplicationLastResponseId(int newLastResponseId) {
+    ApplicationMasterService applicationMasterService =
+        (ApplicationMasterService) amRMProtocol;
+    responseId = newLastResponseId;
+    return applicationMasterService.setAttemptLastResponseId(attemptId,
+        newLastResponseId);
+  }
+
+  public int getResponseId() {
+    return responseId;
+  }
+
   public void addRequests(String[] hosts, int memory, int priority,
       int containers) throws Exception {
     addRequests(hosts, memory, priority, containers, 0L);
@@ -272,19 +284,22 @@ public class MockAM {
 
   public AllocateResponse doAllocateAs(UserGroupInformation ugi,
       final AllocateRequest req) throws Exception {
-    req.setResponseId(++responseId);
+    req.setResponseId(responseId);
     try {
-      return ugi.doAs(new PrivilegedExceptionAction<AllocateResponse>() {
-        @Override
-        public AllocateResponse run() throws Exception {
-          return amRMProtocol.allocate(req);
-        }
-      });
+      AllocateResponse response =
+          ugi.doAs(new PrivilegedExceptionAction<AllocateResponse>() {
+            @Override
+            public AllocateResponse run() throws Exception {
+              return amRMProtocol.allocate(req);
+            }
+          });
+      responseId = response.getResponseId();
+      return response;
     } catch (UndeclaredThrowableException e) {
       throw (Exception) e.getCause();
     }
   }
-  
+
   public AllocateResponse doHeartbeat() throws Exception {
     return allocate(null, null);
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/82957ce0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
index 85a36e7..e684f3c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
@@ -272,7 +272,42 @@ public class TestApplicationMasterService {
     Assert.assertEquals(MockRM.getClusterTimeStamp(), tokenId.getRMIdentifier());
     rm.stop();
   }
-  
+
+  @Test(timeout = 3000000)
+  public void testAllocateResponseIdOverflow() throws Exception {
+    MockRM rm = new MockRM(conf);
+    try {
+      rm.start();
+
+      // Register node1
+      MockNM nm1 = rm.registerNode("127.0.0.1:1234", 6 * GB);
+
+      // Submit an application
+      RMApp app1 = rm.submitApp(2048);
+
+      // kick the scheduling
+      nm1.nodeHeartbeat(true);
+      RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
+      MockAM am1 = rm.sendAMLaunched(attempt1.getAppAttemptId());
+      am1.registerAppAttempt();
+
+      // Set the last reponseId to be MAX_INT
+      Assert.assertTrue(am1.setApplicationLastResponseId(Integer.MAX_VALUE));
+
+      // Both allocate should succeed
+      am1.schedule(); // send allocate with reponseId = MAX_INT
+      Assert.assertEquals(0, am1.getResponseId());
+
+      am1.schedule(); // send allocate with reponseId = 0
+      Assert.assertEquals(1, am1.getResponseId());
+
+    } finally {
+      if (rm != null) {
+        rm.stop();
+      }
+    }
+  }
+
   @Test(timeout=600000)
   public void testInvalidContainerReleaseRequest() throws Exception {
     MockRM rm = new MockRM(conf);


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message