Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id ECF8E200B33 for ; Wed, 29 Jun 2016 13:52:22 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id EB82C160A57; Wed, 29 Jun 2016 11:52:22 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 19006160A4D for ; Wed, 29 Jun 2016 13:52:21 +0200 (CEST) Received: (qmail 92969 invoked by uid 500); 29 Jun 2016 11:52:19 -0000 Mailing-List: contact commits-help@lucene.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@lucene.apache.org Delivered-To: mailing list commits@lucene.apache.org Received: (qmail 92915 invoked by uid 99); 29 Jun 2016 11:52:19 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 29 Jun 2016 11:52:19 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 1B803E5CE1; Wed, 29 Jun 2016 11:52:19 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit From: shalin@apache.org To: commits@lucene.apache.org Message-Id: <63bdc4605f684250ac898ba67b000254@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: lucene-solr:branch_6x: SOLR-8777: Duplicate Solr process can cripple a running process (cherry picked from commit 4ea95bf) Date: Wed, 29 Jun 2016 11:52:19 +0000 (UTC) archived-at: Wed, 29 Jun 2016 11:52:23 -0000 Repository: lucene-solr Updated Branches: refs/heads/branch_6x 976501f6f -> 812fd346f SOLR-8777: Duplicate Solr process can cripple a running process (cherry picked from commit 4ea95bf) Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/812fd346 Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/812fd346 Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/812fd346 Branch: refs/heads/branch_6x Commit: 812fd346f7a136ccfe550a6ba0d7b0e634d68769 Parents: 976501f Author: Shalin Shekhar Mangar Authored: Wed Jun 29 14:49:59 2016 +0530 Committer: Shalin Shekhar Mangar Committed: Wed Jun 29 16:59:56 2016 +0530 ---------------------------------------------------------------------- solr/CHANGES.txt | 4 +- .../org/apache/solr/cloud/ZkController.java | 96 ++++++++++++++------ 2 files changed, 70 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/812fd346/solr/CHANGES.txt ---------------------------------------------------------------------- diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 817d85e..10f490a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -54,10 +54,12 @@ Bug Fixes * SOLR-8626: 404 error when clicking nodes in cloud graph view in angular UI. (janhoy, Trey Grainger via shalin) +* SOLR-8777: Duplicate Solr process can cripple a running process. (Jessica Cheng Mallet, Scott Blum, shalin) + * SOLR-9254: GraphTermsQueryQParserPlugin throws NPE when field being searched is not present in segment (Joel Bernstein) -* SOLR-8657: Fix SolrRequestInfo error logs if QuerySenderListener is being used (Pascal Chollet, +* SOLR-8657: Fix SolrRequestInfo error logs if QuerySenderListener is being used (Pascal Chollet, Tomás Fernández Löbbe) Optimizations http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/812fd346/solr/core/src/java/org/apache/solr/cloud/ZkController.java ---------------------------------------------------------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 1388ee5..102774f 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -25,8 +25,19 @@ import java.net.URLEncoder; import java.net.UnknownHostException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Properties; +import java.util.Set; import java.util.concurrent.Callable; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; @@ -41,7 +52,25 @@ import org.apache.solr.cloud.overseer.OverseerAction; import org.apache.solr.cloud.overseer.SliceMutator; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.solr.common.cloud.*; +import org.apache.solr.common.cloud.BeforeReconnect; +import org.apache.solr.common.cloud.ClusterState; +import org.apache.solr.common.cloud.ClusterStateUtil; +import org.apache.solr.common.cloud.DefaultConnectionStrategy; +import org.apache.solr.common.cloud.DefaultZkACLProvider; +import org.apache.solr.common.cloud.DefaultZkCredentialsProvider; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.OnReconnect; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.SolrZkClient; +import org.apache.solr.common.cloud.ZkACLProvider; +import org.apache.solr.common.cloud.ZkCmdExecutor; +import org.apache.solr.common.cloud.ZkConfigManager; +import org.apache.solr.common.cloud.ZkCoreNodeProps; +import org.apache.solr.common.cloud.ZkCredentialsProvider; +import org.apache.solr.common.cloud.ZkNodeProps; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.cloud.ZooKeeperException; import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.SolrParams; @@ -642,6 +671,8 @@ public final class ZkController { zkStateReader.createClusterStateWatchersAndUpdate(); this.baseURL = zkStateReader.getBaseUrlForNodeName(this.nodeName); + checkForExistingEphemeralNode(); + // start the overseer first as following code may need it's processing if (!zkRunOnly) { overseerElector = new LeaderElector(zkClient); @@ -678,6 +709,39 @@ public final class ZkController { } + private void checkForExistingEphemeralNode() throws KeeperException, InterruptedException { + if (zkRunOnly) { + return; + } + String nodeName = getNodeName(); + String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName; + + if (!zkClient.exists(nodePath, true)) { + return; + } + + final CountDownLatch deletedLatch = new CountDownLatch(1); + Stat stat = zkClient.exists(nodePath, event -> { + if (Watcher.Event.EventType.None.equals(event.getType())) { + return; + } + if (Watcher.Event.EventType.NodeDeleted.equals(event.getType())) { + deletedLatch.countDown(); + } + }, true); + + if (stat == null) { + // znode suddenly disappeared but that's okay + return; + } + + boolean deleted = deletedLatch.await(zkClient.getSolrZooKeeper().getSessionTimeout() * 2, TimeUnit.MILLISECONDS); + if (!deleted) { + throw new SolrException(ErrorCode.SERVER_ERROR, "A previous ephemeral live node still exists. " + + "Solr cannot continue. Please ensure that no other Solr process using the same port is running already."); + } + } + public void publishAndWaitForDownStates() throws KeeperException, InterruptedException { @@ -752,33 +816,7 @@ public final class ZkController { String nodeName = getNodeName(); String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName; log.info("Register node as live in ZooKeeper:" + nodePath); - - try { - boolean nodeDeleted = true; - try { - // we attempt a delete in the case of a quick server bounce - - // if there was not a graceful close, the node may exist - // until expiration timeout - so a node won't be created here because - // it exists, but eventually the node will be removed. So delete - // in case it exists and create a new node. - zkClient.delete(nodePath, -1, true); - } catch (KeeperException.NoNodeException e) { - // fine if there is nothing to delete - // TODO: annoying that ZK logs a warning on us - nodeDeleted = false; - } - if (nodeDeleted) { - log - .info("Found a previous node that still exists while trying to register a new live node " - + nodePath + " - removing existing node to create another."); - } - zkClient.makePath(nodePath, CreateMode.EPHEMERAL, true); - } catch (KeeperException e) { - // it's okay if the node already exists - if (e.code() != KeeperException.Code.NODEEXISTS) { - throw e; - } - } + zkClient.makePath(nodePath, CreateMode.EPHEMERAL, true); } public String getNodeName() {