lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ab@apache.org
Subject [lucene-solr] branch branch_8_7 updated: SOLR-14948: Autoscaling maxComputeOperations override causes exceptions.
Date Wed, 21 Oct 2020 08:00:56 GMT
This is an automated email from the ASF dual-hosted git repository.

ab pushed a commit to branch branch_8_7
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8_7 by this push:
     new 6e5e247  SOLR-14948: Autoscaling maxComputeOperations override causes exceptions.
6e5e247 is described below

commit 6e5e247abcafd6a38590b2835fb60e851cf215db
Author: Andrzej Bialecki <ab@apache.org>
AuthorDate: Wed Oct 21 08:43:05 2020 +0200

    SOLR-14948: Autoscaling maxComputeOperations override causes exceptions.
---
 solr/CHANGES.txt                                   |  2 +
 .../solr/cloud/autoscaling/ComputePlanAction.java  | 61 ++++++++++++++++---
 .../cloud/autoscaling/ComputePlanActionTest.java   | 71 +++++++++++++++++-----
 3 files changed, 110 insertions(+), 24 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 28ae39b..3bfa47d 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -118,6 +118,8 @@ Bug Fixes
 * SOLR-14546: Fix for a relatively hard to hit issue in OverseerTaskProcessor that could lead to out of order execution
   of Collection API tasks competing for a lock (Ilan Ginzburg).
 
+* SOLR-14948: Autoscaling maxComputeOperations override causes exceptions. (ab)
+
 ==================  8.7.0 ==================
 
 Consult the lucene/CHANGES.txt file for additional, low level, changes in this release.
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
index 07cbb38..b76ddb0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
@@ -53,6 +53,8 @@ import static org.apache.solr.cloud.autoscaling.TriggerEvent.NODE_NAMES;
 public class ComputePlanAction extends TriggerActionBase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
+  public static final String DIAGNOSTICS = "__compute_diag__";
+
   // accept all collections by default
   Predicate<String> collectionsPredicate = s -> true;
 
@@ -129,8 +131,12 @@ public class ComputePlanAction extends TriggerActionBase {
         int opCount = 0;
         int opLimit = maxOperations;
         if (requestedOperations > 0) {
+          log.debug("-- adjusting limit due to explicitly requested number of ops={}", requestedOperations);
           opLimit = requestedOperations;
         }
+        addDiagnostics(event, "maxOperations", maxOperations);
+        addDiagnostics(event, "requestedOperations", requestedOperations);
+        addDiagnostics(event, "opLimit", opLimit);
         do {
           // computing changes in large clusters may take a long time
           if (Thread.currentThread().isInterrupted()) {
@@ -156,6 +162,8 @@ public class ComputePlanAction extends TriggerActionBase {
             if (requestedOperations < 0) {
               //uncomment the following to log zero operations
 //              PolicyHelper.logState(cloudManager, initialSuggester);
+              log.debug("-- no more operations suggested, stopping after {} ops...", (opCount - 1));
+              addDiagnostics(event, "noSuggestionsStopAfter", (opCount - 1));
               break;
             } else {
               log.info("Computed plan empty, remained {} requested ops to try.", opCount - opLimit);
@@ -173,6 +181,10 @@ public class ComputePlanAction extends TriggerActionBase {
             operations.add(operation);
             return operations;
           });
+          if (opCount >= opLimit) {
+            log.debug("-- reached limit of maxOps={}, stopping.", opLimit);
+            addDiagnostics(event, "opLimitReached", true);
+          }
         } while (opCount < opLimit);
       } finally {
         releasePolicySession(sessionWrapper, session);
@@ -189,6 +201,14 @@ public class ComputePlanAction extends TriggerActionBase {
 
   }
 
+  private void addDiagnostics(TriggerEvent event, String key, Object value) {
+    if (log.isDebugEnabled()) {
+      Map<String, Object> diag = (Map<String, Object>) event.getProperties()
+          .computeIfAbsent(DIAGNOSTICS, n -> new HashMap<>());
+      diag.put(key, value);
+    }
+  }
+
   protected int getMaxNumOps(TriggerEvent event, AutoScalingConfig autoScalingConfig, ClusterState
clusterState) {
     // estimate a maximum default limit that should be sufficient for most purposes:
     // number of nodes * total number of replicas * 3
@@ -205,14 +225,26 @@ public class ComputePlanAction extends TriggerActionBase {
       totalRF.addAndGet(rf * coll.getSlices().size());
     });
     int totalMax = clusterState.getLiveNodes().size() * totalRF.get() * 3;
-    int maxOp = (Integer) autoScalingConfig.getProperties().getOrDefault(AutoScalingParams.MAX_COMPUTE_OPERATIONS,
totalMax);
+    addDiagnostics(event, "estimatedMaxOps", totalMax);
+    int maxOp = ((Number) autoScalingConfig.getProperties().getOrDefault(AutoScalingParams.MAX_COMPUTE_OPERATIONS,
totalMax)).intValue();
     Object o = event.getProperty(AutoScalingParams.MAX_COMPUTE_OPERATIONS, maxOp);
-    try {
-      return Integer.parseInt(String.valueOf(o));
-    } catch (Exception e) {
-      log.warn("Invalid '{}' event property: {}, using default {}", AutoScalingParams.MAX_COMPUTE_OPERATIONS,
o, maxOp);
-      return maxOp;
+    if (o != null) {
+      try {
+        maxOp = Integer.parseInt(String.valueOf(o));
+      } catch (Exception e) {
+        log.warn("Invalid '{}' event property: {}, using default {}", AutoScalingParams.MAX_COMPUTE_OPERATIONS,
o, maxOp);
+      }
     }
+    if (maxOp < 0) {
+      // unlimited
+      maxOp = Integer.MAX_VALUE;
+    } else if (maxOp < 1) {
+      // try at least one operation
+      log.debug("-- estimated maxOp={}, resetting to 1...", maxOp);
+      maxOp = 1;
+    }
+    log.debug("-- estimated total max ops={}, effective maxOps={}", totalMax, maxOp);
+    return maxOp;
   }
 
   protected int getRequestedNumOps(TriggerEvent event) {
@@ -278,19 +310,27 @@ public class ComputePlanAction extends TriggerActionBase {
       case MOVEREPLICA:
         Suggester s = session.getSuggester(action)
                 .hint(Suggester.Hint.SRC_NODE, event.getProperty(NODE_NAMES));
-        if (applyCollectionHints(cloudManager, s) == 0) return NoneSuggester.get(session);
+        if (applyCollectionHints(cloudManager, s) == 0) {
+          addDiagnostics(event, "noRelevantCollections", true);
+          return NoneSuggester.get(session);
+        }
         return s;
       case DELETENODE:
         int start = (Integer)event.getProperty(START, 0);
         @SuppressWarnings({"unchecked"})
         List<String> srcNodes = (List<String>) event.getProperty(NODE_NAMES);
         if (srcNodes.isEmpty() || start >= srcNodes.size()) {
+          addDiagnostics(event, "noSourceNodes", true);
           return NoneSuggester.get(session);
         }
         String sourceNode = srcNodes.get(start);
         s = session.getSuggester(action)
                 .hint(Suggester.Hint.SRC_NODE, event.getProperty(NODE_NAMES));
-        if (applyCollectionHints(cloudManager, s) == 0) return NoneSuggester.get(session);
+        if (applyCollectionHints(cloudManager, s) == 0) {
+          log.debug("-- no relevant collections on {}, no operations computed.", srcNodes);
+          addDiagnostics(event, "noRelevantCollections", true);
+          return NoneSuggester.get(session);
+        }
         s.hint(Suggester.Hint.SRC_NODE, Collections.singletonList(sourceNode));
         event.getProperties().put(START, ++start);
         return s;
@@ -342,11 +382,16 @@ public class ComputePlanAction extends TriggerActionBase {
                             .forEach(collShards::add);
                   }
                 });
+        log.debug("-- NODE_ADDED: ADDREPLICA suggester configured with {} collection/shard hints.", collShards.size());
+        addDiagnostics(event, "relevantCollShard", collShards);
         suggester.hint(Suggester.Hint.COLL_SHARD, collShards);
         suggester.hint(Suggester.Hint.REPLICATYPE, replicaType);
         break;
       case MOVEREPLICA:
+        log.debug("-- NODE_ADDED event specified MOVEREPLICA - no hints added.");
+        break;
       case NONE:
+        log.debug("-- NODE_ADDED event specified NONE - no operations suggested.");
         break;
       default:
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java
index 2526292..e7a317d 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java
@@ -583,7 +583,7 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     int numShards = 1;
     int numCollections = 5;
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
null);
   }
 
   @Test
@@ -592,7 +592,7 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     int numShards = 1;
     int numCollections = 5;
 
-    nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(collectionNamePrefix, numShards,
numCollections);
+    nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(collectionNamePrefix, numShards,
numCollections, null);
   }
 
   @Test
@@ -602,9 +602,19 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     int numShards = 2;
     int numCollections = 5;
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
null);
   }
-  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int
numShards, int numCollections) throws Exception {
+
+  @Test
+  public void testNodeAddedTriggerWithAddReplicaPreferredOp_2Shard_OpLimit() throws Exception {
+    String collectionNamePrefix = "testNodeAddedTriggerWithAddReplicaPreferredOp_2Shard";
+    int numShards = 2;
+    int numCollections = 5;
+
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
1);
+  }
+
+  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int
numShards, int numCollections, Integer maxOps) throws Exception {
     String setTriggerCommand = "{" +
         "'set-trigger' : {" +
         "'name' : 'node_added_trigger'," +
@@ -624,10 +634,10 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
         "    ]" +
         "}";
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
setTriggerCommand, setClusterPolicyCommand);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
setTriggerCommand, setClusterPolicyCommand, maxOps);
   }
 
-  private void nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(String collectionNamePrefix,
int numShards, int numCollections) throws Exception {
+  private void nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(String collectionNamePrefix,
int numShards, int numCollections, Integer maxOps) throws Exception {
     String setTriggerCommand = "{" +
         "'set-trigger' : {" +
         "'name' : 'node_added_trigger'," +
@@ -648,13 +658,15 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
         "    ]" +
         "}";
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
setTriggerCommand, setClusterPolicyCommand, 0, 1, 0);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
setTriggerCommand, setClusterPolicyCommand, maxOps, 0, 1, 0);
   }
 
-  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int
numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand) throws
Exception {
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
setTriggerCommand, setClusterPolicyCommand, 1, null, null);
+  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int
numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand, Integer
maxOps) throws Exception {
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections,
setTriggerCommand, setClusterPolicyCommand, maxOps, 1, null, null);
   }
-  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int
numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand, Integer
nNrtReplicas, Integer nTlogReplicas, Integer nPullReplicas) throws Exception {
+  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int
numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand,
+                                                         Integer maxOps,
+                                                         Integer nNrtReplicas, Integer nTlogReplicas,
Integer nPullReplicas) throws Exception {
     CloudSolrClient solrClient = cluster.getSolrClient();
     @SuppressWarnings({"rawtypes"})
     SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
@@ -665,6 +677,16 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
+    if (maxOps != null) {
+      String setMaxOpsCommand = "{" +
+          " 'set-properties': {" +
+          "   'maxComputeOperations': " + maxOps +
+          "  }" +
+          "}";
+      req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setMaxOpsCommand);
+      response = solrClient.request(req);
+      assertEquals(response.get("result").toString(), "success");
+    }
 
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionNamePrefix + "_0",
         "conf", numShards, nNrtReplicas, nTlogReplicas, nPullReplicas).setMaxShardsPerNode(2);
@@ -683,7 +705,13 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     @SuppressWarnings({"rawtypes"})
     List operations = (List) actionContext.get("operations");
     assertNotNull(operations);
-    assertEquals(numShards, operations.size());
+    int numExpectedOps;
+    if (maxOps != null && maxOps > 0) {
+      numExpectedOps = maxOps;
+    } else {
+      numExpectedOps = numShards;
+    }
+    assertEquals(numExpectedOps, operations.size());
     Set<String> affectedShards = new HashSet<>(2);
     for (Object operation : operations) {
       assertTrue(operation instanceof CollectionAdminRequest.AddReplica);
@@ -692,7 +720,7 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
       assertEquals(collectionNamePrefix + "_0", addReplica.getCollection());
       affectedShards.add(addReplica.getShard());
     }
-    assertEquals(numShards, affectedShards.size());
+    assertEquals(numExpectedOps, affectedShards.size());
 
     for (int i = 1; i < numCollections; i++) {
       create = CollectionAdminRequest.createCollection(collectionNamePrefix + "_" + i,
@@ -712,7 +740,12 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     actionContext = actionContextPropsRef.get();
     operations = (List) actionContext.get("operations");
     assertNotNull(operations);
-    assertEquals(numCollections * numShards, operations.size());
+    if (maxOps != null && maxOps > 0) {
+      numExpectedOps = maxOps;
+    } else {
+      numExpectedOps = numCollections * numShards;
+    }
+    assertEquals(numExpectedOps, operations.size());
     Set<String> affectedCollections = new HashSet<>(numCollections);
     affectedShards = new HashSet<>(numShards);
     Set<Pair<String, String>> affectedCollShards = new HashSet<>(numCollections * numShards);
@@ -724,9 +757,15 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
       affectedShards.add(addReplica.getShard());
       affectedCollShards.add(new Pair<>(addReplica.getCollection(), addReplica.getShard()));
     }
-    assertEquals(numCollections, affectedCollections.size());
-    assertEquals(numShards, affectedShards.size());
-    assertEquals(numCollections * numShards, affectedCollShards.size());
+    if (maxOps != null && maxOps > 0) {
+      assertEquals(numExpectedOps, affectedCollections.size());
+      assertEquals(numExpectedOps, affectedShards.size());
+      assertEquals(numExpectedOps, affectedCollShards.size());
+    } else {
+      assertEquals(numCollections, affectedCollections.size());
+      assertEquals(numShards, affectedShards.size());
+      assertEquals(numCollections * numShards, affectedCollShards.size());
+    }
   }
 
   @Test


Mime
View raw message