Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id BE7FA200BEF for ; Tue, 29 Nov 2016 08:49:02 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id BD26A160B15; Tue, 29 Nov 2016 07:49:02 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id DFBD6160B05 for ; Tue, 29 Nov 2016 08:49:01 +0100 (CET) Received: (qmail 34289 invoked by uid 500); 29 Nov 2016 07:48:59 -0000 Mailing-List: contact issues-help@flink.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@flink.apache.org Delivered-To: mailing list issues@flink.apache.org Received: (qmail 34246 invoked by uid 99); 29 Nov 2016 07:48:59 -0000 Received: from arcas.apache.org (HELO arcas) (140.211.11.28) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 29 Nov 2016 07:48:58 +0000 Received: from arcas.apache.org (localhost [127.0.0.1]) by arcas (Postfix) with ESMTP id D2D002C03DB for ; Tue, 29 Nov 2016 07:48:58 +0000 (UTC) Date: Tue, 29 Nov 2016 07:48:58 +0000 (UTC) From: "ASF GitHub Bot (JIRA)" To: issues@flink.apache.org Message-ID: In-Reply-To: References: Subject: [jira] [Commented] (FLINK-4927) Implement FLI-6 YARN Resource Manager MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394 archived-at: Tue, 29 Nov 2016 07:49:02 -0000 [ https://issues.apache.org/jira/browse/FLINK-4927?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15704562#comment-15704562 ] ASF GitHub Bot commented on FLINK-4927: --------------------------------------- Github user shuai-xu commented on a diff in the pull request: 
https://github.com/apache/flink/pull/2808#discussion_r89955045 --- Diff: flink-yarn/src/main/java/org/apache/flink/yarn/YarnResourceManager.java --- @@ -0,0 +1,551 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.yarn; + +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.clusterframework.ApplicationStatus; +import org.apache.flink.runtime.clusterframework.BootstrapTools; +import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters; +import org.apache.flink.runtime.clusterframework.types.ResourceID; +import org.apache.flink.runtime.clusterframework.types.ResourceProfile; +import org.apache.flink.runtime.highavailability.HighAvailabilityServices; +import org.apache.flink.runtime.metrics.MetricRegistry; +import org.apache.flink.runtime.resourcemanager.JobLeaderIdService; +import org.apache.flink.runtime.resourcemanager.ResourceManager; +import org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration; +import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException; +import 
org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory; +import org.apache.flink.runtime.rpc.FatalErrorHandler; +import org.apache.flink.runtime.rpc.RpcService; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.yarn.api.ApplicationConstants; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ContainerStatus; +import org.apache.hadoop.yarn.api.records.Container; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.NodeReport; +import org.apache.hadoop.yarn.api.records.LocalResource; +import org.apache.hadoop.yarn.client.api.AMRMClient; +import org.apache.hadoop.yarn.client.api.NMClient; +import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.util.Records; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.concurrent.duration.FiniteDuration; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Map; +import java.util.HashMap; +import java.util.List; +import java.util.Collections; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH; + +/** + * The yarn implementation of the resource manager. Used when the system is started + * via the resource framework YARN. 
+ */ +public class YarnResourceManager extends ResourceManager implements AMRMClientAsync.CallbackHandler { + protected final Logger LOG = LoggerFactory.getLogger(getClass()); + + /** The process environment variables */ + private final Map ENV; + + /** The heartbeat interval while the resource master is waiting for containers */ + private static final int FAST_YARN_HEARTBEAT_INTERVAL_MS = 500; + + /** The default heartbeat interval during regular operation */ + private static final int DEFAULT_YARN_HEARTBEAT_INTERVAL_MS = 5000; + + /** The maximum time that TaskExecutors may be waiting to register at the ResourceManager before they quit */ + private static final FiniteDuration TASKEXECUTOR_REGISTRATION_TIMEOUT = new FiniteDuration(5, TimeUnit.MINUTES); + + /** Environment variable name of the final container id used by the YarnResourceManager. + * Container ID generation may vary across Hadoop versions. */ + final static String ENV_FLINK_CONTAINER_ID = "_FLINK_CONTAINER_ID"; + + /** Environment variable name of the hostname used by the Yarn. + * TaskExecutor use this host name to start port. 
*/ + final static String ENV_FLINK_NODE_ID = "_FLINK_NODE_ID"; + + /** Default heartbeat interval between this resource manager and the YARN ResourceManager */ + private final int yarnHeartbeatIntervalMillis; + + private final Configuration flinkConfig; + + private final YarnConfiguration yarnConfig; + + /** Client to communicate with the Resource Manager (YARN's master) */ + private AMRMClientAsync resourceManagerClient; + + /** Client to communicate with the Node manager and launch TaskExecutor processes */ + private NMClient nodeManagerClient; + + /** The number of containers requested, but not yet granted */ + private int numPendingContainerRequests; + + public YarnResourceManager( + Configuration flinkConfig, + Map env, + RpcService rpcService, + ResourceManagerConfiguration resourceManagerConfiguration, + HighAvailabilityServices highAvailabilityServices, + SlotManagerFactory slotManagerFactory, + MetricRegistry metricRegistry, + JobLeaderIdService jobLeaderIdService, + FatalErrorHandler fatalErrorHandler) { + super( + rpcService, + resourceManagerConfiguration, + highAvailabilityServices, + slotManagerFactory, + metricRegistry, + jobLeaderIdService, + fatalErrorHandler); + this.flinkConfig = flinkConfig; + this.yarnConfig = new YarnConfiguration(); + this.ENV = env; + final int yarnHeartbeatIntervalMS = flinkConfig.getInteger( + ConfigConstants.YARN_HEARTBEAT_DELAY_SECONDS, DEFAULT_YARN_HEARTBEAT_INTERVAL_MS / 1000) * 1000; + + final long yarnExpiryIntervalMS = yarnConfig.getLong( + YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS); + + if (yarnHeartbeatIntervalMS >= yarnExpiryIntervalMS) { + log.warn("The heartbeat interval of the Flink Application master ({}) is greater " + + "than YARN's expiry interval ({}). 
The application is likely to be killed by YARN.", + yarnHeartbeatIntervalMS, yarnExpiryIntervalMS); + } + yarnHeartbeatIntervalMillis = yarnHeartbeatIntervalMS; + numPendingContainerRequests = 0; + } + + @Override + protected void initialize() throws ResourceManagerException { + resourceManagerClient = AMRMClientAsync.createAMRMClientAsync(yarnHeartbeatIntervalMillis, this); + resourceManagerClient.init(yarnConfig); + resourceManagerClient.start(); + try { + //TODO: change akka address to tcp host and port, the getAddress() interface should return a standard tcp address + Tuple2 hostPort = parseHostPort(getAddress()); + resourceManagerClient.registerApplicationMaster(hostPort.f0, hostPort.f1, getAddress()); + } catch (Exception e) { + LOG.info("registerApplicationMaster fail", e); + } + + // create the client to communicate with the node managers + nodeManagerClient = NMClient.createNMClient(); + nodeManagerClient.init(yarnConfig); + nodeManagerClient.start(); + nodeManagerClient.cleanupRunningContainersOnStop(true); --- End diff -- There are some problems if we do so. First, the AMRMClientAsync needs a CallbackHandler, but the Dispatcher and the YarnResourceManager need different CallbackHandlers. Also, I think what the Dispatcher needs in order to allocate containers for new JobMasters is a YarnClient. > Implement FLI-6 YARN Resource Manager > ------------------------------------- > > Key: FLINK-4927 > URL: https://issues.apache.org/jira/browse/FLINK-4927 > Project: Flink > Issue Type: Sub-task > Components: YARN > Environment: {{flip-6}} feature branch > Reporter: Stephan Ewen > Assignee: shuai.xu > > The Flink YARN Resource Manager communicates with YARN's Resource Manager to acquire and release containers. > It is also responsible for eagerly notifying the JobManager about container failures. -- This message was sent by Atlassian JIRA (v6.3.4#6332)