flink-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (FLINK-8910) Introduce automated end-to-end test for local recovery (including sticky scheduling)
Date Tue, 20 Mar 2018 16:25:01 GMT

    [ https://issues.apache.org/jira/browse/FLINK-8910?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16406614#comment-16406614
] 

ASF GitHub Bot commented on FLINK-8910:
---------------------------------------

Github user tillrohrmann commented on a diff in the pull request:

    https://github.com/apache/flink/pull/5676#discussion_r175828150
  
    --- Diff: flink-end-to-end-tests/src/main/java/org/apache/flink/streaming/tests/StickyAllocationAndLocalRecoveryTestJob.java
---
    @@ -0,0 +1,451 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + * <p>
    + * http://www.apache.org/licenses/LICENSE-2.0
    + * <p>
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.streaming.tests;
    +
    +import org.apache.flink.api.common.functions.MapFunction;
    +import org.apache.flink.api.common.functions.RichFlatMapFunction;
    +import org.apache.flink.api.common.functions.RuntimeContext;
    +import org.apache.flink.api.common.restartstrategy.RestartStrategies;
    +import org.apache.flink.api.common.state.ListState;
    +import org.apache.flink.api.common.state.ListStateDescriptor;
    +import org.apache.flink.api.common.state.ValueState;
    +import org.apache.flink.api.common.state.ValueStateDescriptor;
    +import org.apache.flink.api.java.functions.KeySelector;
    +import org.apache.flink.api.java.utils.ParameterTool;
    +import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;
    +import org.apache.flink.runtime.state.CheckpointListener;
    +import org.apache.flink.runtime.state.FunctionInitializationContext;
    +import org.apache.flink.runtime.state.FunctionSnapshotContext;
    +import org.apache.flink.runtime.state.filesystem.FsStateBackend;
    +import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
    +import org.apache.flink.streaming.api.environment.CheckpointConfig;
    +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    +import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
    +import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
    +import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
    +import org.apache.flink.util.Collector;
    +import org.apache.flink.util.Preconditions;
    +
    +import org.apache.commons.lang3.RandomStringUtils;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
    +
    +import java.io.IOException;
    +import java.io.Serializable;
    +import java.util.ArrayList;
    +import java.util.HashSet;
    +import java.util.Iterator;
    +import java.util.List;
    +import java.util.Set;
    +
    +/**
     + * Automated end-to-end test for local recovery (including sticky allocation).
    + */
    +public class StickyAllocationAndLocalRecoveryTestJob {
    +
    +	private static final Logger LOG = LoggerFactory.getLogger(StickyAllocationAndLocalRecoveryTestJob.class);
    +
    +	public static void main(String[] args) throws Exception {
    +
    +		final ParameterTool pt = ParameterTool.fromArgs(args);
    +
    +		final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    +
    +		env.setParallelism(pt.getInt("parallelism", 1));
    +		env.setMaxParallelism(pt.getInt("maxParallelism", pt.getInt("parallelism", 1)));
    +		env.enableCheckpointing(pt.getInt("checkpointInterval", 1000));
    +		env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, pt.getInt("restartDelay",
0)));
    +		if (pt.has("externalizedCheckpoints") && pt.getBoolean("externalizedCheckpoints",
false)) {
    +			env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
    +		}
    +
    +		String stateBackend = pt.get("stateBackend", "file");
    +		String checkpointDir = pt.getRequired("checkpointDir");
    +
    +		boolean killJvmOnFail = pt.getBoolean("killJvmOnFail", false);
    +
    +		if ("file".equals(stateBackend)) {
    +			boolean asyncCheckpoints = pt.getBoolean("asyncCheckpoints", false);
    +			env.setStateBackend(new FsStateBackend(checkpointDir, asyncCheckpoints));
    +		} else if ("rocks".equals(stateBackend)) {
    +			boolean incrementalCheckpoints = pt.getBoolean("incrementalCheckpoints", false);
    +			env.setStateBackend(new RocksDBStateBackend(checkpointDir, incrementalCheckpoints));
    +		} else {
    +			throw new IllegalArgumentException("Unknown backend: " + stateBackend);
    +		}
    +
    +		// make parameters available in the web interface
    +		env.getConfig().setGlobalJobParameters(pt);
    +
    +		// delay to throttle down the production of the source
    +		long delay = pt.has("delay") ? pt.getLong("delay") : 0L;
    +
    +		// the maximum number of attempts, before the job finishes with success
    +		int maxAttempts = pt.has("maxAttempts") ? pt.getInt("maxAttempts") : 3;
    +
    +		// size of one artificial value
    +		int valueSize = pt.has("valueSize") ? pt.getInt("valueSize") : 10;
    +
    +		env.addSource(new RandomLongSource(maxAttempts, delay))
    +			.keyBy((KeySelector<Long, Long>) aLong -> aLong)
    +			.flatMap(new StateCreatingFlatMap(valueSize, killJvmOnFail))
    +			.map((MapFunction<String, String>) value -> value)
    +			.addSink(new PrintSinkFunction<>());
    +
    +		env.execute("Sticky Allocation And Local Recovery Test");
    +	}
    +
    +	/**
    +	 * Source function that produces a long sequence.
    +	 */
    +	private static final class RandomLongSource extends RichSourceFunction<Long> implements
CheckpointedFunction {
    +
    +		private static final long serialVersionUID = 1L;
    +
    +		/**
    +		 * Generator delay between two events.
    +		 */
    +		final long delay;
    +
    +		/**
    +		 * Maximum restarts before shutting down this source.
    +		 */
    +		final int maxAttempts;
    +
    +		/**
    +		 * State that holds the current key for recovery.
    +		 */
    +		ListState<Long> sourceCurrentKeyState;
    +
    +		/**
    +		 * Generator's current key.
    +		 */
    +		long currentKey;
    +
    +		/**
    +		 * Generator runs while this is true.
    +		 */
    +		volatile boolean running;
    +
    +		RandomLongSource(int maxAttempts, long delay) {
    +			this.delay = delay;
    +			this.maxAttempts = maxAttempts;
    +			this.running = true;
    +		}
    +
    +		@Override
    +		public void run(SourceContext<Long> sourceContext) throws Exception {
    +
    +			int numberOfParallelSubtasks = getRuntimeContext().getNumberOfParallelSubtasks();
    +			int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask();
    +
    +			// the source emits one final event and shuts down once we have reached max attempts.
    +			if (getRuntimeContext().getAttemptNumber() > maxAttempts) {
    +				sourceContext.collect(Long.MAX_VALUE - subtaskIdx);
    --- End diff --
    
    element emission should always happen under the checkpoint lock


> Introduce automated end-to-end test for local recovery (including sticky scheduling)
> ------------------------------------------------------------------------------------
>
>                 Key: FLINK-8910
>                 URL: https://issues.apache.org/jira/browse/FLINK-8910
>             Project: Flink
>          Issue Type: Sub-task
>          Components: State Backends, Checkpointing
>    Affects Versions: 1.5.0
>            Reporter: Stefan Richter
>            Assignee: Stefan Richter
>            Priority: Major
>             Fix For: 1.5.0
>
>
> We should have an automated end-to-end test that can run nightly to check that sticky
allocation and local recovery work as expected.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Mime
View raw message