hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From varunsax...@apache.org
Subject [25/50] [abbrv] hadoop git commit: YARN-3359. Recover collector list when RM fails over (Li Lu via Varun Saxena)
Date Tue, 25 Apr 2017 21:53:05 GMT
YARN-3359. Recover collector list when RM fails over (Li Lu via Varun Saxena)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/03f8d653
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/03f8d653
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/03f8d653

Branch: refs/heads/YARN-5355
Commit: 03f8d653e2f8cd32e4b34908023b9666c96eaed2
Parents: 39c88af
Author: Varun Saxena <varunsaxena@apache.org>
Authored: Sun Nov 6 23:00:48 2016 +0530
Committer: Varun Saxena <varunsaxena@apache.org>
Committed: Mon Apr 24 16:17:42 2017 +0530

----------------------------------------------------------------------
 .../yarn/server/nodemanager/NodeManager.java    |  39 ++++++
 .../resourcemanager/ResourceTrackerService.java |  26 ++--
 .../yarn/server/resourcemanager/MockNM.java     |  16 +++
 .../TestRMHATimelineCollectors.java             | 120 +++++++++++++++++++
 4 files changed, 190 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/03f8d653/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
index 3ddf4c3..e8858dc 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
@@ -63,6 +63,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag
 import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider;
@@ -466,8 +467,14 @@ public class NodeManager extends CompositeService
           if (!rmWorkPreservingRestartEnabled) {
             LOG.info("Cleaning up running containers on resync");
             containerManager.cleanupContainersOnNMResync();
+            // Clear all known collectors for resync.
+            if (context.getKnownCollectors() != null) {
+              context.getKnownCollectors().clear();
+            }
           } else {
             LOG.info("Preserving containers on resync");
+            // Re-register known timeline collectors.
+            reregisterCollectors();
           }
           ((NodeStatusUpdaterImpl) nodeStatusUpdater)
             .rebootNodeStatusUpdaterAndRegisterWithRM();
@@ -479,6 +486,38 @@ public class NodeManager extends CompositeService
     }.start();
   }
 
+  /**
+   * Reregisters all collectors known by this node to the RM. This method is
+   * called when the RM needs to resync with the node.
+   */
+  protected void reregisterCollectors() {
+    Map<ApplicationId, AppCollectorData> knownCollectors
+        = context.getKnownCollectors();
+    if (knownCollectors == null) {
+      return;
+    }
+    Map<ApplicationId, AppCollectorData> registeringCollectors
+        = context.getRegisteringCollectors();
+    for (Map.Entry<ApplicationId, AppCollectorData> entry
+        : knownCollectors.entrySet()) {
+      Application app = context.getApplications().get(entry.getKey());
+      if ((app != null)
+          && !ApplicationState.FINISHED.equals(app.getApplicationState())) {
+        registeringCollectors.putIfAbsent(entry.getKey(), entry.getValue());
+        AppCollectorData data = entry.getValue();
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(entry.getKey() + " : " + data.getCollectorAddr() + "@<"
+              + data.getRMIdentifier() + ", " + data.getVersion() + ">");
+        }
+      } else {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Remove collector data for done app " + entry.getKey());
+        }
+      }
+    }
+    knownCollectors.clear();
+  }
+
   public static class NMContext implements Context {
 
     private NodeId nodeId = null;

http://git-wip-us.apache.org/repos/asf/hadoop/blob/03f8d653/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
index 92bfc37..b468f46 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
@@ -656,17 +656,21 @@ public class ResourceTrackerService extends AbstractService implements
             LOG.warn("Cannot update collector info because application ID: " +
                 appId + " is not found in RMContext!");
           } else {
-            AppCollectorData previousCollectorData = rmApp.getCollectorData();
-            if (AppCollectorData.happensBefore(previousCollectorData,
-                collectorData)) {
-              // Sending collector update event.
-              // Note: RM has to store the newly received collector data
-              // synchronously. Otherwise, the RM may send out stale collector
-              // data before this update is done, and the RM then crashes, the
-              // newly updated collector data will get lost.
-              LOG.info("Update collector information for application " + appId
-                  + " with new address: " + collectorData.getCollectorAddr());
-              ((RMAppImpl) rmApp).setCollectorData(collectorData);
+            synchronized (rmApp) {
+              AppCollectorData previousCollectorData = rmApp.getCollectorData();
+              if (AppCollectorData.happensBefore(previousCollectorData,
+                  collectorData)) {
+                // Sending collector update event.
+                // Note: RM has to store the newly received collector data
+                // synchronously. Otherwise, the RM may send out stale collector
+                // data before this update is done, and the RM then crashes, the
+                // newly updated collector data will get lost.
+                LOG.info("Update collector information for application " + appId
+                    + " with new address: " + collectorData.getCollectorAddr()
+                    + " timestamp: " + collectorData.getRMIdentifier()
+                    + ", " + collectorData.getVersion());
+                ((RMAppImpl) rmApp).setCollectorData(collectorData);
+              }
             }
           }
         }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/03f8d653/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
index 2d76127..4a8ff00 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
@@ -24,6 +24,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
@@ -39,6 +40,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
 import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
 import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
 import org.apache.hadoop.yarn.server.api.records.NodeStatus;
@@ -60,6 +62,8 @@ public class MockNM {
   private String version;
   private Map<ContainerId, ContainerStatus> containerStats =
       new HashMap<ContainerId, ContainerStatus>();
+  private Map<ApplicationId, AppCollectorData> registeringCollectors
+      = new ConcurrentHashMap<>();
 
   public MockNM(String nodeIdStr, int memory, ResourceTrackerService resourceTracker) {
     // scale vcores based on the requested memory
@@ -117,6 +121,15 @@ public class MockNM {
         true, ++responseId);
   }
 
+  public void addRegisteringCollector(ApplicationId appId,
+      AppCollectorData data) {
+    this.registeringCollectors.put(appId, data);
+  }
+
+  public Map<ApplicationId, AppCollectorData> getRegisteringCollectors() {
+    return this.registeringCollectors;
+  }
+
   public RegisterNodeManagerResponse registerNode() throws Exception {
     return registerNode(null, null);
   }
@@ -229,6 +242,9 @@ public class MockNM {
     req.setNodeStatus(status);
     req.setLastKnownContainerTokenMasterKey(this.currentContainerTokenMasterKey);
     req.setLastKnownNMTokenMasterKey(this.currentNMTokenMasterKey);
+
+    req.setRegisteringCollectors(this.registeringCollectors);
+
     NodeHeartbeatResponse heartbeatResponse =
         resourceTracker.nodeHeartbeat(req);
     

http://git-wip-us.apache.org/repos/asf/hadoop/blob/03f8d653/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHATimelineCollectors.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHATimelineCollectors.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHATimelineCollectors.java
new file mode 100644
index 0000000..a54ff34
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHATimelineCollectors.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
+import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test if the new active RM could recover collector status on a state
+ * transition.
+ */
+public class TestRMHATimelineCollectors extends RMHATestBase {
+  public static final Log LOG = LogFactory
+      .getLog(TestSubmitApplicationWithRMHA.class);
+
+  @Before
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    confForRM1.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
+    confForRM2.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
+    confForRM1.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 2.0f);
+    confForRM2.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 2.0f);
+  }
+
+  @Test
+  public void testRebuildCollectorDataOnFailover() throws Exception {
+    startRMs();
+    MockNM nm1
+        = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
+    MockNM nm2
+        = new MockNM("127.0.0.1:5678", 15121, rm2.getResourceTrackerService());
+    RMApp app1 = rm1.submitApp(1024);
+    String collectorAddr1 = "1.2.3.4:5";
+    AppCollectorData data1 = AppCollectorData.newInstance(
+        app1.getApplicationId(), collectorAddr1);
+    nm1.addRegisteringCollector(app1.getApplicationId(), data1);
+
+    String collectorAddr2 = "5.4.3.2:1";
+    RMApp app2 = rm1.submitApp(1024);
+    AppCollectorData data2 = AppCollectorData.newInstance(
+        app2.getApplicationId(), collectorAddr2, rm1.getStartTime(), 1);
+    nm1.addRegisteringCollector(app2.getApplicationId(), data2);
+
+    explicitFailover();
+
+    List<ApplicationId> runningApps = new ArrayList<>();
+    runningApps.add(app1.getApplicationId());
+    runningApps.add(app2.getApplicationId());
+    nm1.registerNode(runningApps);
+    nm2.registerNode(runningApps);
+
+    String collectorAddr12 = "1.2.3.4:56";
+    AppCollectorData data12 = AppCollectorData.newInstance(
+        app1.getApplicationId(), collectorAddr12, rm1.getStartTime(), 0);
+    nm2.addRegisteringCollector(app1.getApplicationId(), data12);
+
+    String collectorAddr22 = "5.4.3.2:10";
+    AppCollectorData data22 = AppCollectorData.newInstance(
+        app2.getApplicationId(), collectorAddr22, rm1.getStartTime(), 2);
+    nm2.addRegisteringCollector(app2.getApplicationId(), data22);
+
+    Map<ApplicationId, AppCollectorData> results1
+        = nm1.nodeHeartbeat(true).getAppCollectors();
+    assertEquals(collectorAddr1,
+        results1.get(app1.getApplicationId()).getCollectorAddr());
+    assertEquals(collectorAddr2,
+        results1.get(app2.getApplicationId()).getCollectorAddr());
+
+    Map<ApplicationId, AppCollectorData> results2
+        = nm2.nodeHeartbeat(true).getAppCollectors();
+    // addr of app1 should be collectorAddr1 since it's registering (no time
+    // stamp).
+    assertEquals(collectorAddr1,
+        results2.get(app1.getApplicationId()).getCollectorAddr());
+    // addr of app2 should be collectorAddr22 since its version number is
+    // greater.
+    assertEquals(collectorAddr22,
+        results2.get(app2.getApplicationId()).getCollectorAddr());
+
+    // Now nm1 should get updated collector list
+    nm1.getRegisteringCollectors().clear();
+    Map<ApplicationId, AppCollectorData> results12
+        = nm1.nodeHeartbeat(true).getAppCollectors();
+    assertEquals(collectorAddr1,
+        results12.get(app1.getApplicationId()).getCollectorAddr());
+    assertEquals(collectorAddr22,
+        results12.get(app2.getApplicationId()).getCollectorAddr());
+
+
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


Mime
View raw message