hadoop-yarn-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vino...@apache.org
Subject svn commit: r1404431 - in /hadoop/common/trunk/hadoop-yarn-project: CHANGES.txt hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
Date Thu, 01 Nov 2012 00:32:37 GMT
Author: vinodkv
Date: Thu Nov  1 00:32:37 2012
New Revision: 1404431

URL: http://svn.apache.org/viewvc?rev=1404431&view=rev
Log:
YARN-189. Fixed a deadlock between RM's ApplicationMasterService and the dispatcher. Contributed
by Thomas Graves.

Modified:
    hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt
    hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java

Modified: hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt?rev=1404431&r1=1404430&r2=1404431&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt Thu Nov  1 00:32:37 2012
@@ -196,6 +196,9 @@ Release 0.23.5 - UNRELEASED
     YARN-166. capacity scheduler doesn't allow capacity < 1.0 (tgraves via
     bobby)
 
+    YARN-189. Fixed a deadlock between RM's ApplicationMasterService and the
+    dispatcher. (Thomas Graves via vinodkv)
+
 Release 0.23.4 - UNRELEASED
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java?rev=1404431&r1=1404430&r2=1404431&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java Thu Nov  1 00:32:37 2012
@@ -265,10 +265,10 @@ public class ApplicationMasterService ex
       // Oh damn! Sending reboot isn't enough. RM state is corrupted. TODO:
       allocateResponse.setAMResponse(reboot);
       return allocateResponse;
-    }
-
+    } 
+    
     // Allow only one thread in AM to do heartbeat at a time.
-    synchronized (lastResponse) { // BUG TODO: Locking order is screwed.
+    synchronized (lastResponse) {
 
       // Send the status update to the appAttempt.
       this.rmContext.getDispatcher().getEventHandler().handle(
@@ -282,7 +282,8 @@ public class ApplicationMasterService ex
       Allocation allocation =
           this.rScheduler.allocate(appAttemptId, ask, release);
 
-      RMApp app = this.rmContext.getRMApps().get(appAttemptId.getApplicationId());
+      RMApp app = this.rmContext.getRMApps().get(
+          appAttemptId.getApplicationId());
       RMAppAttempt appAttempt = app.getRMAppAttempt(appAttemptId);
       
       AMResponse response = recordFactory.newRecordInstance(AMResponse.class);
@@ -316,7 +317,18 @@ public class ApplicationMasterService ex
           .pullJustFinishedContainers());
       response.setResponseId(lastResponse.getResponseId() + 1);
       response.setAvailableResources(allocation.getResourceLimit());
-      responseMap.put(appAttemptId, response);
+      
+      AMResponse oldResponse = responseMap.put(appAttemptId, response);
+      if (oldResponse == null) {
+        // appAttempt got unregistered, remove it back out
+        responseMap.remove(appAttemptId);
+        String message = "App Attempt removed from the cache during allocate"
+            + appAttemptId;
+        LOG.error(message);
+        allocateResponse.setAMResponse(reboot);
+        return allocateResponse;
+      }
+      
       allocateResponse.setAMResponse(response);
       allocateResponse.setNumClusterNodes(this.rScheduler.getNumClusterNodes());
       return allocateResponse;
@@ -331,12 +343,7 @@ public class ApplicationMasterService ex
   }
 
   public void unregisterAttempt(ApplicationAttemptId attemptId) {
-    AMResponse lastResponse = responseMap.get(attemptId);
-    if (lastResponse != null) {
-      synchronized (lastResponse) {
-        responseMap.remove(attemptId);
-      }
-    }
+    responseMap.remove(attemptId);
   }
 
   public void refreshServiceAcls(Configuration configuration, 



Mime
View raw message