flink-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (FLINK-4927) Implement FLI-6 YARN Resource Manager
Date Tue, 29 Nov 2016 07:48:58 GMT

    [ https://issues.apache.org/jira/browse/FLINK-4927?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15704562#comment-15704562
] 

ASF GitHub Bot commented on FLINK-4927:
---------------------------------------

Github user shuai-xu commented on a diff in the pull request:

    https://github.com/apache/flink/pull/2808#discussion_r89955045
  
    --- Diff: flink-yarn/src/main/java/org/apache/flink/yarn/YarnResourceManager.java ---
    @@ -0,0 +1,551 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.yarn;
    +
    +import org.apache.flink.api.java.tuple.Tuple2;
    +import org.apache.flink.configuration.ConfigConstants;
    +import org.apache.flink.configuration.Configuration;
    +import org.apache.flink.runtime.clusterframework.ApplicationStatus;
    +import org.apache.flink.runtime.clusterframework.BootstrapTools;
    +import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
    +import org.apache.flink.runtime.clusterframework.types.ResourceID;
    +import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
    +import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
    +import org.apache.flink.runtime.metrics.MetricRegistry;
    +import org.apache.flink.runtime.resourcemanager.JobLeaderIdService;
    +import org.apache.flink.runtime.resourcemanager.ResourceManager;
    +import org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration;
    +import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
    +import org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory;
    +import org.apache.flink.runtime.rpc.FatalErrorHandler;
    +import org.apache.flink.runtime.rpc.RpcService;
    +import org.apache.hadoop.fs.Path;
    +import org.apache.hadoop.io.DataOutputBuffer;
    +import org.apache.hadoop.security.Credentials;
    +import org.apache.hadoop.security.UserGroupInformation;
    +import org.apache.hadoop.yarn.api.ApplicationConstants;
    +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
    +import org.apache.hadoop.yarn.api.records.Priority;
    +import org.apache.hadoop.yarn.api.records.Resource;
    +import org.apache.hadoop.yarn.api.records.ContainerStatus;
    +import org.apache.hadoop.yarn.api.records.Container;
    +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
    +import org.apache.hadoop.yarn.api.records.NodeReport;
    +import org.apache.hadoop.yarn.api.records.LocalResource;
    +import org.apache.hadoop.yarn.client.api.AMRMClient;
    +import org.apache.hadoop.yarn.client.api.NMClient;
    +import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
    +import org.apache.hadoop.yarn.conf.YarnConfiguration;
    +import org.apache.hadoop.yarn.util.Records;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
    +import scala.concurrent.duration.FiniteDuration;
    +
    +import java.io.File;
    +import java.io.IOException;
    +import java.nio.ByteBuffer;
    +import java.util.Map;
    +import java.util.HashMap;
    +import java.util.List;
    +import java.util.Collections;
    +import java.util.UUID;
    +import java.util.concurrent.TimeUnit;
    +
    +import static org.apache.flink.yarn.YarnConfigKeys.ENV_FLINK_CLASSPATH;
    +
    +/**
    + * The yarn implementation of the resource manager. Used when the system is started
    + * via the resource framework YARN.
    + */
    +public class YarnResourceManager extends ResourceManager<ResourceID> implements
AMRMClientAsync.CallbackHandler {
    +	protected final Logger LOG = LoggerFactory.getLogger(getClass());
    +
    +	/** The process environment variables */
    +	private final Map<String, String> ENV;
    +
    +	/** The heartbeat interval while the resource master is waiting for containers */
    +	private static final int FAST_YARN_HEARTBEAT_INTERVAL_MS = 500;
    +
    +	/** The default heartbeat interval during regular operation */
    +	private static final int DEFAULT_YARN_HEARTBEAT_INTERVAL_MS = 5000;
    +
    +	/** The maximum time that TaskExecutors may be waiting to register at the ResourceManager
before they quit */
    +	private static final FiniteDuration TASKEXECUTOR_REGISTRATION_TIMEOUT = new FiniteDuration(5,
TimeUnit.MINUTES);
    +
    +	/** Environment variable name of the final container id used by the YarnResourceManager.
    +	 * Container ID generation may vary across Hadoop versions. */
    +	final static String ENV_FLINK_CONTAINER_ID = "_FLINK_CONTAINER_ID";
    +	
    +	/** Environment variable name of the hostname used by the Yarn.
    +	 * TaskExecutor use this host name to start port. */
    +	final static String ENV_FLINK_NODE_ID = "_FLINK_NODE_ID";
    +
    +	/** Default heartbeat interval between this resource manager and the YARN ResourceManager
*/
    +	private final int yarnHeartbeatIntervalMillis;
    +
    +	private final Configuration flinkConfig;
    +
    +	private final YarnConfiguration yarnConfig;
    +
    +	/** Client to communicate with the Resource Manager (YARN's master) */
    +	private AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient;
    +
    +	/** Client to communicate with the Node manager and launch TaskExecutor processes */
    +	private NMClient nodeManagerClient;
    +
    +	/** The number of containers requested, but not yet granted */
    +	private int numPendingContainerRequests;
    +
    +	public YarnResourceManager(
    +			Configuration flinkConfig,
    +			Map<String, String> env,
    +			RpcService rpcService,
    +			ResourceManagerConfiguration resourceManagerConfiguration,
    +			HighAvailabilityServices highAvailabilityServices,
    +			SlotManagerFactory slotManagerFactory,
    +			MetricRegistry metricRegistry,
    +			JobLeaderIdService jobLeaderIdService,
    +			FatalErrorHandler fatalErrorHandler) {
    +		super(
    +			rpcService,
    +			resourceManagerConfiguration,
    +			highAvailabilityServices,
    +			slotManagerFactory,
    +			metricRegistry,
    +			jobLeaderIdService,
    +			fatalErrorHandler);
    +		this.flinkConfig  = flinkConfig;
    +		this.yarnConfig = new YarnConfiguration();
    +		this.ENV = env;
    +		final int yarnHeartbeatIntervalMS = flinkConfig.getInteger(
    +				ConfigConstants.YARN_HEARTBEAT_DELAY_SECONDS, DEFAULT_YARN_HEARTBEAT_INTERVAL_MS
/ 1000) * 1000;
    +
    +		final long yarnExpiryIntervalMS = yarnConfig.getLong(
    +				YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS,
    +				YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS);
    +
    +		if (yarnHeartbeatIntervalMS >= yarnExpiryIntervalMS) {
    +			log.warn("The heartbeat interval of the Flink Application master ({}) is greater "
+
    +					"than YARN's expiry interval ({}). The application is likely to be killed by YARN.",
    +					yarnHeartbeatIntervalMS, yarnExpiryIntervalMS);
    +		}
    +		yarnHeartbeatIntervalMillis = yarnHeartbeatIntervalMS;
    +		numPendingContainerRequests = 0;
    +	}
    +
    +	@Override
    +	protected void initialize() throws ResourceManagerException {
    +		resourceManagerClient = AMRMClientAsync.createAMRMClientAsync(yarnHeartbeatIntervalMillis,
this);
    +		resourceManagerClient.init(yarnConfig);
    +		resourceManagerClient.start();
    +		try {
    +			//TODO: change akka address to tcp host and port, the getAddress() interface should
return a standard tcp address
    +			Tuple2<String, Integer> hostPort = parseHostPort(getAddress());
    +			resourceManagerClient.registerApplicationMaster(hostPort.f0, hostPort.f1, getAddress());
    +		} catch (Exception e) {
    +			LOG.info("registerApplicationMaster fail", e);
    +		}
    +
    +		// create the client to communicate with the node managers
    +		nodeManagerClient = NMClient.createNMClient();
    +		nodeManagerClient.init(yarnConfig);
    +		nodeManagerClient.start();
    +		nodeManagerClient.cleanupRunningContainersOnStop(true);
    --- End diff --
    
    There are some problems if we do so. First, the AMRMClientAsync needs a CallbackHandler,
but the Dispatcher and the YarnResourceManager need different CallbackHandlers. Second, I think
what the Dispatcher needs in order to allocate containers for new JobMasters is a YarnClient.


> Implement FLI-6 YARN Resource Manager
> -------------------------------------
>
>                 Key: FLINK-4927
>                 URL: https://issues.apache.org/jira/browse/FLINK-4927
>             Project: Flink
>          Issue Type: Sub-task
>          Components: YARN
>         Environment: {{flip-6}} feature branch
>            Reporter: Stephan Ewen
>            Assignee: shuai.xu
>
> The Flink YARN Resource Manager communicates with YARN's Resource Manager to acquire and release containers.
> It is also responsible for eagerly notifying the JobManager about container failures.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Mime
View raw message