Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 82BA6200D36 for ; Mon, 6 Nov 2017 20:56:27 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 814CB160BEC; Mon, 6 Nov 2017 19:56:27 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id A0EF2160BD5 for ; Mon, 6 Nov 2017 20:56:26 +0100 (CET) Received: (qmail 27642 invoked by uid 500); 6 Nov 2017 19:56:25 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 27633 invoked by uid 99); 6 Nov 2017 19:56:25 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 06 Nov 2017 19:56:25 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id A79B6DFC25; Mon, 6 Nov 2017 19:56:25 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: ab@apache.org To: commits@lucene.apache.org Message-Id: <7266c02dcec641958589569dea0f0c9e@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: lucene-solr:jira/solr-11458: SOLR-11458: Use moveNormalReplica for HDFS replicas when RF==1. Date: Mon, 6 Nov 2017 19:56:25 +0000 (UTC) archived-at: Mon, 06 Nov 2017 19:56:27 -0000 Repository: lucene-solr Updated Branches: refs/heads/jira/solr-11458 [created] 26c15f1f0 SOLR-11458: Use moveNormalReplica for HDFS replicas when RF==1. Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/26c15f1f Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/26c15f1f Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/26c15f1f Branch: refs/heads/jira/solr-11458 Commit: 26c15f1f08709d7638d92d211753a2920954fb1b Parents: dc6119b Author: Andrzej Bialecki Authored: Mon Nov 6 20:55:40 2017 +0100 Committer: Andrzej Bialecki Committed: Mon Nov 6 20:55:40 2017 +0100 ---------------------------------------------------------------------- .../org/apache/solr/cloud/MoveReplicaCmd.java | 9 +++- .../solr/cloud/MoveReplicaHDFSFailoverTest.java | 55 ++++++++++++++++---- .../solr/common/cloud/ClusterStateUtil.java | 2 +- 3 files changed, 52 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26c15f1f/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java index a2ed407..a89e8ba 100644 --- a/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java @@ -112,9 +112,14 @@ public class MoveReplicaCmd implements Cmd{ } assert slice != null; Object dataDir = replica.get("dataDir"); - if (dataDir != null && dataDir.toString().startsWith("hdfs:/")) { + // don't move the only replica in place - if it fails we can lose data + boolean inPlaceMove = slice.getReplicas().size() > 1; + log.debug("--- in-place move allowed=" + inPlaceMove); + if (inPlaceMove && dataDir != null && dataDir.toString().startsWith("hdfs:/")) { + log.debug("--- using moveHdfsReplica"); moveHdfsReplica(clusterState, results, dataDir.toString(), targetNode, async, coll, replica, slice, timeout, waitForFinalState); } else { + log.debug("--- using moveNormalReplica"); moveNormalReplica(clusterState, results, targetNode, async, coll, replica, slice, timeout, waitForFinalState); } } @@ -224,7 +229,7 @@ public class MoveReplicaCmd implements Cmd{ results.add("failure", errorString); return; } else { - log.debug("Replica " + watcher.getActiveReplicas() + " is active - deleting the source..."); + log.info("Replica " + watcher.getActiveReplicas() + " is active - deleting the source..."); } } finally { ocmh.zkStateReader.removeCollectionStateWatcher(coll.getName(), watcher); http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26c15f1f/solr/core/src/test/org/apache/solr/cloud/MoveReplicaHDFSFailoverTest.java ---------------------------------------------------------------------- diff --git a/solr/core/src/test/org/apache/solr/cloud/MoveReplicaHDFSFailoverTest.java b/solr/core/src/test/org/apache/solr/cloud/MoveReplicaHDFSFailoverTest.java index 5edae7c..6621fc4 100644 --- a/solr/core/src/test/org/apache/solr/cloud/MoveReplicaHDFSFailoverTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/MoveReplicaHDFSFailoverTest.java @@ -34,6 +34,7 @@ import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.ZkConfigManager; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.util.BadHdfsThreadsFilter; +import org.apache.solr.util.LogLevel; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -42,6 +43,7 @@ import org.junit.Test; BadHdfsThreadsFilter.class, // hdfs currently leaks thread(s) MoveReplicaHDFSTest.ForkJoinThreadsFilter.class }) +@LogLevel("org.apache.solr.cloud=DEBUG;org.apache.solr.cloud.overseer=DEBUG;org.apache.solr.client.solrj.impl.SolrClientDataProvider=DEBUG;") public class MoveReplicaHDFSFailoverTest extends SolrCloudTestCase { private static MiniDFSCluster dfsCluster; @@ -70,12 +72,12 @@ public class MoveReplicaHDFSFailoverTest extends SolrCloudTestCase { @Test public void testDataDirAndUlogAreMaintained() throws Exception { String coll = "movereplicatest_coll2"; - CollectionAdminRequest.createCollection(coll, "conf1", 1, 1) + CollectionAdminRequest.createCollection(coll, "conf1", 1, 2) .setCreateNodeSet("") .process(cluster.getSolrClient()); String hdfsUri = HdfsTestUtil.getURI(dfsCluster); - String dataDir = hdfsUri + "/dummyFolder/dataDir"; - String ulogDir = hdfsUri + "/dummyFolder2/ulogDir"; + String dataDir = hdfsUri + "/dummyFolder11/dataDir"; + String ulogDir = hdfsUri + "/dummyFolder12/ulogDir"; CollectionAdminResponse res = CollectionAdminRequest .addReplicaToShard(coll, "shard1") .setDataDir(dataDir) @@ -83,6 +85,15 @@ public class MoveReplicaHDFSFailoverTest extends SolrCloudTestCase { .setNode(cluster.getJettySolrRunner(0).getNodeName()) .process(cluster.getSolrClient()); + String dataDir2 = hdfsUri + "/dummyFolder21/dataDir"; + String ulogDir2 = hdfsUri + "/dummyFolder22/ulogDir"; + res = CollectionAdminRequest + .addReplicaToShard(coll, "shard1") + .setDataDir(dataDir2) + .setUlogDir(ulogDir2) + .setNode(cluster.getJettySolrRunner(0).getNodeName()) + .process(cluster.getSolrClient()); + ulogDir += "/tlog"; ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader(); assertTrue(ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 120000)); @@ -96,8 +107,15 @@ public class MoveReplicaHDFSFailoverTest extends SolrCloudTestCase { .process(cluster.getSolrClient()); assertTrue(ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 120000)); docCollection = zkStateReader.getClusterState().getCollection(coll); - assertEquals(1, docCollection.getSlice("shard1").getReplicas().size()); - Replica newReplica = docCollection.getReplicas().iterator().next(); + assertEquals(2, docCollection.getSlice("shard1").getReplicas().size()); + Replica newReplica = null; + for (Replica r : docCollection.getReplicas()) { + if (r.getCoreName().equals(replica.getCoreName())) { + newReplica = r; + break; + } + } + assertNotNull(newReplica); assertEquals(newReplica.getNodeName(), cluster.getJettySolrRunner(1).getNodeName()); assertTrue(newReplica.getStr("ulogDir"), newReplica.getStr("ulogDir").equals(ulogDir) || newReplica.getStr("ulogDir").equals(ulogDir+'/')); assertTrue(newReplica.getStr("dataDir"),newReplica.getStr("dataDir").equals(dataDir) || newReplica.getStr("dataDir").equals(dataDir+'/')); @@ -112,14 +130,27 @@ public class MoveReplicaHDFSFailoverTest extends SolrCloudTestCase { Thread.sleep(5000); new CollectionAdminRequest.MoveReplica(coll, newReplica.getName(), cluster.getJettySolrRunner(0).getNodeName()) .process(cluster.getSolrClient()); - assertTrue(ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 120000)); + boolean active = ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 120000); + if (!active) { + fail("Time out waiting for all replicas to become active: " + zkStateReader.getClusterState().getCollection(coll)); + } // assert that the old core will be removed on startup cluster.getJettySolrRunner(1).start(); - assertTrue(ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 120000)); + active = ClusterStateUtil.waitForAllActiveAndLiveReplicas(zkStateReader, 120000); + if (!active) { + fail("Time out waiting for all replicas to become active: " + zkStateReader.getClusterState().getCollection(coll)); + } docCollection = zkStateReader.getClusterState().getCollection(coll); - assertEquals(1, docCollection.getReplicas().size()); - newReplica = docCollection.getReplicas().iterator().next(); + assertEquals(2, docCollection.getReplicas().size()); + newReplica = null; + for (Replica r : docCollection.getReplicas()) { + if (r.getCoreName().equals(replica.getCoreName())) { + newReplica = r; + break; + } + } + assertNotNull(newReplica); assertEquals(newReplica.getNodeName(), cluster.getJettySolrRunner(0).getNodeName()); assertTrue(newReplica.getStr("ulogDir"), newReplica.getStr("ulogDir").equals(ulogDir) || newReplica.getStr("ulogDir").equals(ulogDir+'/')); assertTrue(newReplica.getStr("dataDir"),newReplica.getStr("dataDir").equals(dataDir) || newReplica.getStr("dataDir").equals(dataDir+'/')); @@ -144,9 +175,11 @@ public class MoveReplicaHDFSFailoverTest extends SolrCloudTestCase { assertTrue(ClusterStateUtil.waitForAllReplicasNotLive(cluster.getSolrClient().getZkStateReader(), 20000)); // move replica from node0 -> node1 - new CollectionAdminRequest.MoveReplica(coll, replica.getName(), cluster.getJettySolrRunner(1).getNodeName()) - .process(cluster.getSolrClient()); + CollectionAdminRequest.MoveReplica moveReq = new CollectionAdminRequest.MoveReplica(coll, replica.getName(), cluster.getJettySolrRunner(1).getNodeName()); + moveReq.setWaitForFinalState(true); + moveReq.process(cluster.getSolrClient()); assertTrue(ClusterStateUtil.waitForAllActiveAndLiveReplicas(cluster.getSolrClient().getZkStateReader(), 20000)); + assertEquals(2, cluster.getSolrClient().query(coll, new SolrQuery("*:*")).getResults().getNumFound()); cluster.getJettySolrRunners().get(1).stop(); assertTrue(ClusterStateUtil.waitForAllReplicasNotLive(cluster.getSolrClient().getZkStateReader(), 20000)); http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26c15f1f/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterStateUtil.java ---------------------------------------------------------------------- diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterStateUtil.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterStateUtil.java index 0910868..fad46e6 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterStateUtil.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterStateUtil.java @@ -100,7 +100,7 @@ public class ClusterStateUtil { } } } - + return success; }