lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From no...@apache.org
Subject [02/50] [abbrv] lucene-solr:feature/autoscaling: SOLR-11297: Message 'Lock held by this virtual machine' during startup. Solr is trying to start some cores twice
Date Tue, 03 Oct 2017 13:38:43 GMT
SOLR-11297: Message 'Lock held by this virtual machine' during startup.  Solr is trying to
start some cores twice


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6391a75a
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6391a75a
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6391a75a

Branch: refs/heads/feature/autoscaling
Commit: 6391a75a50ecc05db0d7a5ed9adc9fe187a4f57e
Parents: feec5c6
Author: Erick Erickson <erick@apache.org>
Authored: Mon Sep 25 12:12:31 2017 -0700
Committer: Erick Erickson <erick@apache.org>
Committed: Mon Sep 25 12:12:31 2017 -0700

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  3 ++
 .../org/apache/solr/core/CoreContainer.java     | 42 +++++++++++++++++---
 2 files changed, 40 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6391a75a/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 0c58639..18219d4 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -145,6 +145,9 @@ Bug Fixes
 * SOLR-11363: JSON Facet API: repeated values in a numeric points field with docValues enabled
   were double counted. (Hossman, yonik)
 
+* SOLR-11297: Message "Lock held by this virtual machine" during startup.  Solr is trying to start some cores twice.
+  (Luiz Armesto, Shawn Heisey, Erick Erickson)
+
 
 Optimizations
 ----------------------

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6391a75a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index bf24db8..f3f3c49 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -637,9 +637,10 @@ public class CoreContainer {
               if (zkSys.getZkController() != null) {
                 zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
               }
-
+              solrCores.waitAddPendingCoreOps(cd.getName());
               core = createFromDescriptor(cd, false, false);
             } finally {
+              solrCores.removeFromPendingOps(cd.getName());
               if (asyncSolrCoreLoad) {
                 solrCores.markCoreAsNotLoading(cd);
               }
@@ -935,7 +936,13 @@ public class CoreContainer {
       // first and clean it up if there's an error.
       coresLocator.create(this, cd);
 
-      SolrCore core = createFromDescriptor(cd, true, newCollection);
+      SolrCore core = null;
+      try {
+        solrCores.waitAddPendingCoreOps(cd.getName());
+        core = createFromDescriptor(cd, true, newCollection);
+      } finally {
+        solrCores.removeFromPendingOps(cd.getName());
+      }
 
       return core;
     } catch (Exception ex) {
@@ -970,7 +977,6 @@ public class CoreContainer {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
           "Error CREATEing SolrCore '" + coreName + "': " + ex.getMessage() + rootMsg, ex);
     }
-
   }
 
   /**
@@ -979,6 +985,26 @@ public class CoreContainer {
    * @param dcore        a core descriptor
    * @param publishState publish core state to the cluster if true
    *
+   * WARNING: Any call to this method should be surrounded by a try/finally block
+   *          that calls solrCores.waitAddPendingCoreOps(...) and solrCores.removeFromPendingOps(...)
+   *
+   *  <pre>
+   *   <code>
+   *   try {
+   *      solrCores.waitAddPendingCoreOps(dcore.getName());
+   *      createFromDescriptor(...);
+   *   } finally {
+   *      solrCores.removeFromPendingOps(dcore.getName());
+   *   }
+   *   </code>
+   * </pre>
+   *
+   *  Trying to put the waitAddPending... in this method results in Bad Things Happening due to race conditions.
+   *  getCore() depends on getting the core returned _if_ it's in the pending list due to some other thread opening it.
+   *  If the core is not in the pending list and not loaded, then getCore() calls this method. Anything that called
+   *  to check if the core was loaded _or_ in pending ops and, based on the return called createFromDescriptor would
+   *  introduce a race condition, see getCore() for the place it would be a problem
+   *
    * @return the newly created core
    */
   private SolrCore createFromDescriptor(CoreDescriptor dcore, boolean publishState, boolean newCollection) {
@@ -1258,7 +1284,12 @@ public class CoreContainer {
     } else {
       CoreLoadFailure clf = coreInitFailures.get(name);
       if (clf != null) {
-        createFromDescriptor(clf.cd, true, false);
+        try {
+          solrCores.waitAddPendingCoreOps(clf.cd.getName());
+          createFromDescriptor(clf.cd, true, false);
+        } finally {
+          solrCores.removeFromPendingOps(clf.cd.getName());
+        }
       } else {
         throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No such core: " + name );
       }
@@ -1431,7 +1462,8 @@ public class CoreContainer {
     // TestLazyCores
     if (desc == null || zkSys.getZkController() != null) return null;
 
-    // This will put an entry in pending core ops if the core isn't loaded
+    // This will put an entry in pending core ops if the core isn't loaded. Here's where moving the
+    // waitAddPendingCoreOps to createFromDescriptor would introduce a race condition.
     core = solrCores.waitAddPendingCoreOps(name);
 
     if (isShutDown) return null; // We're quitting, so stop. This needs to be after the wait above since we may come off


Mime
View raw message