Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 96957200B95 for ; Mon, 12 Sep 2016 11:33:41 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 9544A160AD8; Mon, 12 Sep 2016 09:33:41 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 924AA160AD7 for ; Mon, 12 Sep 2016 11:33:40 +0200 (CEST) Received: (qmail 71721 invoked by uid 500); 12 Sep 2016 09:33:39 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 71442 invoked by uid 99); 12 Sep 2016 09:33:39 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 12 Sep 2016 09:33:39 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 1B3DFE3607; Mon, 12 Sep 2016 09:33:39 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: shalin@apache.org To: commits@lucene.apache.org Date: Mon, 12 Sep 2016 09:33:41 -0000 Message-Id: In-Reply-To: References: X-Mailer: ASF-Git Admin Mailer Subject: [04/15] lucene-solr:branch_6_2: SOLR-9439: Shard split clean up logic for older failed splits is faulty (cherry picked from commit 7d2f42e) archived-at: Mon, 12 Sep 2016 09:33:41 -0000 SOLR-9439: Shard split clean up logic for older failed splits is faulty (cherry picked from commit 7d2f42e) (cherry picked from commit 97b6216) Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/348b3e8f Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/348b3e8f Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/348b3e8f Branch: refs/heads/branch_6_2 Commit: 348b3e8fe43afbde8a5fe3b5658aee60935276c4 Parents: 9920ac9 Author: Shalin Shekhar Mangar Authored: Sat Aug 27 09:08:53 2016 +0530 Committer: Shalin Shekhar Mangar Committed: Mon Sep 12 10:02:16 2016 +0530 ---------------------------------------------------------------------- solr/CHANGES.txt | 2 + .../org/apache/solr/cloud/SplitShardCmd.java | 62 ++++++++++++++++---- .../org/apache/solr/core/CoreContainer.java | 7 ++- .../org/apache/solr/util/TestInjection.java | 20 +++++++ .../org/apache/solr/cloud/ShardSplitTest.java | 54 +++++++++++++++++ 5 files changed, 130 insertions(+), 15 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/348b3e8f/solr/CHANGES.txt ---------------------------------------------------------------------- diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index f9c0373..d5024b8 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -30,6 +30,8 @@ Bug Fixes * SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin) +* SOLR-9439: Shard split clean up logic for older failed splits is faulty. The delete shard API + has also been made more resilient against failures resulting from non-existent cores. (shalin) ================== 6.2.0 ================== Versions of Major Components http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/348b3e8f/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java index d7bbf66..4463285 100644 --- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java @@ -46,6 +46,7 @@ import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.Utils; import org.apache.solr.handler.component.ShardHandler; +import org.apache.solr.util.TestInjection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -79,6 +80,7 @@ public class SplitShardCmd implements Cmd { log.info("Split shard invoked"); ZkStateReader zkStateReader = ocmh.zkStateReader; + zkStateReader.forceUpdateCollection(collectionName); String splitKey = message.getStr("split.key"); ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(); @@ -197,7 +199,10 @@ public class SplitShardCmd implements Cmd { subSlices.add(subSlice); String subShardName = collectionName + "_" + subSlice + "_replica1"; subShardNames.add(subShardName); + } + boolean oldShardsDeleted = false; + for (String subSlice : subSlices) { Slice oSlice = collection.getSlice(subSlice); if (oSlice != null) { final Slice.State state = oSlice.getState(); @@ -206,24 +211,33 @@ public class SplitShardCmd implements Cmd { "Sub-shard: " + subSlice + " exists in active state. Aborting split shard."); } else if (state == Slice.State.CONSTRUCTION || state == Slice.State.RECOVERY) { // delete the shards - for (String sub : subSlices) { - log.info("Sub-shard: {} already exists therefore requesting its deletion", sub); - Map propMap = new HashMap<>(); - propMap.put(Overseer.QUEUE_OPERATION, "deleteshard"); - propMap.put(COLLECTION_PROP, collectionName); - propMap.put(SHARD_ID_PROP, sub); - ZkNodeProps m = new ZkNodeProps(propMap); - try { - ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList()); - } catch (Exception e) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + sub, - e); - } + log.info("Sub-shard: {} already exists therefore requesting its deletion", subSlice); + Map propMap = new HashMap<>(); + propMap.put(Overseer.QUEUE_OPERATION, "deleteshard"); + propMap.put(COLLECTION_PROP, collectionName); + propMap.put(SHARD_ID_PROP, subSlice); + ZkNodeProps m = new ZkNodeProps(propMap); + try { + ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList()); + } catch (SolrException e) { + throwIfNotNonExistentCoreException(subSlice, e); + } catch (Exception e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice, + e); } + + oldShardsDeleted = true; } } } + if (oldShardsDeleted) { + // refresh the locally cached cluster state + zkStateReader.forceUpdateCollection(collectionName); + clusterState = zkStateReader.getClusterState(); + collection = clusterState.getCollection(collectionName); + } + final String asyncId = message.getStr(ASYNC); Map requestMap = new HashMap<>(); @@ -406,6 +420,8 @@ public class SplitShardCmd implements Cmd { replicas.add(propMap); } + assert TestInjection.injectSplitFailureBeforeReplicaCreation(); + // we must set the slice state into recovery before actually creating the replica cores // this ensures that the logic inside Overseer to update sub-shard state to 'active' // always gets a chance to execute. See SOLR-7673 @@ -455,4 +471,24 @@ public class SplitShardCmd implements Cmd { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e); } } + + private void throwIfNotNonExistentCoreException(String subSlice, SolrException e) { + Throwable t = e; + String cause = null; + while (t != null) { + if (t instanceof SolrException) { + SolrException solrException = (SolrException) t; + cause = solrException.getMetadata("cause"); + if (cause != null && !"NonExistentCore".equals(cause)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice, + e); + } + } + t = t.getCause(); + } + if (!"NonExistentCore".equals(cause)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice, + e); + } + } } http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/348b3e8f/solr/core/src/java/org/apache/solr/core/CoreContainer.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index b4442df..7f80b13 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -996,8 +996,11 @@ public class CoreContainer { } CoreDescriptor cd = solrCores.getCoreDescriptor(name); - if (cd == null) - throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]"); + if (cd == null) { + SolrException solrException = new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]"); + solrException.setMetadata("cause", "NonExistentCore"); + throw solrException; + } boolean close = solrCores.isLoadedNotPendingClose(name); SolrCore core = solrCores.remove(name); http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/348b3e8f/solr/core/src/java/org/apache/solr/util/TestInjection.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java index 03de74d..efd80bf 100644 --- a/solr/core/src/java/org/apache/solr/util/TestInjection.java +++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java @@ -113,6 +113,8 @@ public class TestInjection { public static String randomDelayInCoreCreation = null; public static int randomDelayMaxInCoreCreationInSec = 10; + + public static String splitFailureBeforeReplicaCreation = null; private static Set timers = Collections.synchronizedSet(new HashSet()); @@ -124,6 +126,7 @@ public class TestInjection { updateLogReplayRandomPause = null; updateRandomPause = null; randomDelayInCoreCreation = null; + splitFailureBeforeReplicaCreation = null; for (Timer timer : timers) { timer.cancel(); @@ -285,6 +288,23 @@ public class TestInjection { return true; } + + public static boolean injectSplitFailureBeforeReplicaCreation() { + if (splitFailureBeforeReplicaCreation != null) { + Random rand = random(); + if (null == rand) return true; + + Pair pair = parseValue(splitFailureBeforeReplicaCreation); + boolean enabled = pair.first(); + int chanceIn100 = pair.second(); + if (enabled && rand.nextInt(100) >= (100 - chanceIn100)) { + log.info("Injecting failure in creating replica for sub-shard"); + throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to create replica"); + } + } + + return true; + } private static Pair parseValue(String raw) { Matcher m = ENABLED_PERCENT.matcher(raw); http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/348b3e8f/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java ---------------------------------------------------------------------- diff --git a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java index 21dc257..389660f 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java @@ -41,6 +41,7 @@ import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.CompositeIdRouter; +import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.DocRouter; import org.apache.solr.common.cloud.HashBasedRouter; import org.apache.solr.common.cloud.Replica; @@ -50,6 +51,7 @@ import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.Utils; +import org.apache.solr.util.TestInjection; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,6 +92,58 @@ public class ShardSplitTest extends BasicDistributedZkTest { //waitForThingsToLevelOut(15); } + /** + * Used to test that we can split a shard when a previous split event + * left sub-shards in construction or recovery state. + * + * See SOLR-9439 + */ + @Test + public void testSplitAfterFailedSplit() throws Exception { + waitForThingsToLevelOut(15); + + TestInjection.splitFailureBeforeReplicaCreation = "true:100"; // we definitely want split to fail + try { + try { + CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION); + splitShard.setShardName(SHARD1); + splitShard.process(cloudClient); + fail("Shard split was not supposed to succeed after failure injection!"); + } catch (Exception e) { + // expected + } + + // assert that sub-shards cores exist and sub-shard is in construction state + ZkStateReader zkStateReader = cloudClient.getZkStateReader(); + zkStateReader.forceUpdateCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION); + ClusterState state = zkStateReader.getClusterState(); + DocCollection collection = state.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION); + + Slice shard10 = collection.getSlice(SHARD1_0); + assertEquals(Slice.State.CONSTRUCTION, shard10.getState()); + assertEquals(1, shard10.getReplicas().size()); + + Slice shard11 = collection.getSlice(SHARD1_1); + assertEquals(Slice.State.CONSTRUCTION, shard11.getState()); + assertEquals(1, shard11.getReplicas().size()); + + // lets retry the split + TestInjection.reset(); // let the split succeed + try { + CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION); + splitShard.setShardName(SHARD1); + splitShard.process(cloudClient); + // Yay! + } catch (Exception e) { + log.error("Shard split failed", e); + fail("Shard split did not succeed after a previous failed split attempt left sub-shards in construction state"); + } + + } finally { + TestInjection.reset(); + } + } + @Test public void testSplitShardWithRule() throws Exception { waitForThingsToLevelOut(15);