From dev-return-45422-archive-asf-public=cust-asf.ponee.io@ignite.apache.org Mon Mar 25 05:19:03 2019 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id AEFAE18076D for ; Mon, 25 Mar 2019 06:19:02 +0100 (CET) Received: (qmail 48216 invoked by uid 500); 25 Mar 2019 05:19:01 -0000 Mailing-List: contact dev-help@ignite.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ignite.apache.org Delivered-To: mailing list dev@ignite.apache.org Received: (qmail 48183 invoked by uid 99); 25 Mar 2019 05:19:01 -0000 Received: from mailrelay1-us-west.apache.org (HELO mailrelay1-us-west.apache.org) (209.188.14.139) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 25 Mar 2019 05:19:01 +0000 Received: from jira-lw-us.apache.org (unknown [207.244.88.139]) by mailrelay1-us-west.apache.org (ASF Mail Server at mailrelay1-us-west.apache.org) with ESMTP id 7E97CE0016 for ; Mon, 25 Mar 2019 05:19:00 +0000 (UTC) Received: from jira-lw-us.apache.org (localhost [127.0.0.1]) by jira-lw-us.apache.org (ASF Mail Server at jira-lw-us.apache.org) with ESMTP id 3A5F22459B for ; Mon, 25 Mar 2019 05:19:00 +0000 (UTC) Date: Mon, 25 Mar 2019 05:19:00 +0000 (UTC) From: "Roman Shtykh (JIRA)" To: dev@ignite.apache.org Message-ID: In-Reply-To: References: Subject: [jira] [Created] (IGNITE-11620) GridDhtInvalidPartitionException stops the cluster MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394 Roman Shtykh created IGNITE-11620: ------------------------------------- Summary: GridDhtInvalidPartitionException stops the cluster Key: IGNITE-11620 URL: https://issues.apache.org/jira/browse/IGNITE-11620 Project: Ignite Issue Type: Bug Affects Versions: 2.7, 2.6 Reporter: Roman Shtykh When injecting 
data and having it expire while rebalancing is in progress, *GridDhtInvalidPartitionException* triggers *SYSTEM_WORKER_TERMINATION*. This can cause cascading failures in the cluster and take the whole cluster down. Simple test case: {noformat} import org.apache.ignite.IgniteCache; import org.apache.ignite.configuration.CacheConfiguration; import org.apache.ignite.configuration.IgniteConfiguration; import org.apache.ignite.failure.StopNodeOrHaltFailureHandler; import org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi; import org.apache.ignite.spi.discovery.tcp.ipfinder.TcpDiscoveryIpFinder; import org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder; import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; import javax.cache.expiry.CreatedExpiryPolicy; import javax.cache.expiry.Duration; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import static org.apache.ignite.cache.CacheAtomicityMode.ATOMIC; import static org.apache.ignite.cache.CacheMode.PARTITIONED; /** * */ public class ExpireWhileRebalanceTest extends GridCommonAbstractTest { private static final int ENTRIES = 500000; /** * */ protected static final TcpDiscoveryIpFinder IP_FINDER = new TcpDiscoveryVmIpFinder(true); /** * {@inheritDoc} */ @Override protected IgniteConfiguration getConfiguration(String gridName) throws Exception { IgniteConfiguration cfg = super.getConfiguration(gridName); ((TcpDiscoverySpi) cfg.getDiscoverySpi()).setIpFinder(IP_FINDER); cfg.setFailureHandler(new StopNodeOrHaltFailureHandler()); CacheConfiguration ccfg = new CacheConfiguration<>(DEFAULT_CACHE_NAME); ccfg.setAtomicityMode(ATOMIC); ccfg.setCacheMode(PARTITIONED); ccfg.setExpiryPolicyFactory(CreatedExpiryPolicy.factoryOf(new Duration(TimeUnit.SECONDS, 1))); cfg.setCacheConfiguration(ccfg); return cfg; } /** * @throws Exception If failed. 
*/ public void testExpireWhileRebalancing() throws Exception { startGridsMultiThreaded(4); IgniteCache cache = ignite(0).cache(DEFAULT_CACHE_NAME); CountDownLatch latch = new CountDownLatch(1); new Thread(() -> { for (int i = 1; i <= ENTRIES; i++) { cache.put(i, i); if (i % (ENTRIES / 10) == 0) System.out.println(">>> Entries put: " + i); } latch.countDown(); }).start(); // stopping 0 has no effect stopGrid(3); awaitPartitionMapExchange(); startGrid(3); latch.await(10, TimeUnit.SECONDS); } /** * {@inheritDoc} */ @Override protected void afterTest() throws Exception { stopAllGrids(); } } {noformat} -- This message was sent by Atlassian JIRA (v7.6.3#76005)