lucene-commits mailing list archives

From hoss...@apache.org
Subject [lucene-solr] branch branch_8x updated: SOLR-14159: Eliminate some 'spin loops' in tests that may be contributing factors to odd test failures
Date Wed, 22 Jan 2020 22:04:55 GMT
This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new e0820ac  SOLR-14159: Eliminate some 'spin loops' in tests that may be contributing factors to odd test failures
e0820ac is described below

commit e0820acc45c24cf0c7bd8dabe36d9a9c72b35483
Author: Chris Hostetter <hossman@apache.org>
AuthorDate: Wed Jan 22 14:44:56 2020 -0700

    SOLR-14159: Eliminate some 'spin loops' in tests that may be contributing factors to odd test failures
    
    (cherry picked from commit 6b3e7feba19d2314d8c38205dbf1ab1fe2607096)
---
 .../apache/solr/cloud/TestCloudConsistency.java    | 49 ++++++++++++++--------
 .../solr/cloud/TestTlogReplayVsRecovery.java       | 24 +++++++----
 2 files changed, 47 insertions(+), 26 deletions(-)
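
Both test classes get the same treatment: a hand-rolled TimeOut spin loop that called fail() whenever a bad leader appeared is replaced by a blocking waitForState call wrapped in expectThrows, so that a TimeoutException becomes the expected, passing outcome. The standalone sketch below illustrates that control flow; it is not part of the commit, and the class name SpinLoopVsBlockingWait and the helper waitForCondition are invented here as a generic stand-in for the waitForState call used in the diff.

    import java.util.concurrent.TimeoutException;
    import java.util.function.BooleanSupplier;

    // Generic sketch of the pattern: wait for a "bad" condition and treat a timeout as success.
    public class SpinLoopVsBlockingWait {

        // Stand-in for a blocking wait such as the waitForState call in the diff:
        // polls the condition and throws TimeoutException if it never becomes true.
        static void waitForCondition(BooleanSupplier condition, long timeoutMs)
                throws TimeoutException, InterruptedException {
            final long deadline = System.nanoTime() + timeoutMs * 1_000_000L;
            while (System.nanoTime() < deadline) {
                if (condition.getAsBoolean()) {
                    return; // the "bad" state showed up before the timeout
                }
                Thread.sleep(50); // back off between checks instead of busy-spinning
            }
            throw new TimeoutException("condition not met within " + timeoutMs + "ms");
        }

        public static void main(String[] args) throws Exception {
            // Models "an out-of-sync replica became the active leader"; hard-wired to false here,
            // so the wait is expected to time out -- which is the passing outcome.
            BooleanSupplier badStateObserved = () -> false;
            try {
                waitForCondition(badStateObserved, 1_000);
                throw new AssertionError("Out of sync replica became leader");
            } catch (TimeoutException expected) {
                // Timing out is the desired result, mirroring expectThrows(TimeoutException.class, ...)
                System.out.println("Timed out as expected: " + expected.getMessage());
            }
        }
    }

Compared with the old loop, the blocking wait backs off between checks and has a single, well-defined timeout path instead of busy-spinning against TimeSource.NANO_TIME.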

diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java
index c79ae54..9d04f99 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java
@@ -25,6 +25,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.solr.JSONTestUtil;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -37,8 +38,6 @@ import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.ZkCoreNodeProps;
 import org.apache.solr.common.util.NamedList;
-import org.apache.solr.common.util.TimeSource;
-import org.apache.solr.util.TimeOut;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -168,14 +167,22 @@ public class TestCloudConsistency extends SolrCloudTestCase {
     
     cluster.waitForNode(j1, 30);
     cluster.waitForNode(j2, 30);
-    
-    TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
-    while (!timeOut.hasTimedOut()) {
-      Replica newLeader = getCollectionState(collection).getSlice("shard1").getLeader();
-      if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) {
-        fail("Out of sync replica became leader " + newLeader);
-      }
-    }
+
+    // the meat of the test -- wait to see if a different replica becomes the leader
+    // the correct behavior is that this should time out; if it succeeds, we have a problem...
+    expectThrows(TimeoutException.class,
+                 "Did not time out waiting for new leader, out of sync replica became leader",
+                 () -> {
+                   cluster.getSolrClient().waitForState(collection, 10, TimeUnit.SECONDS, (state) -> {
+            Replica newLeader = state.getSlice("shard1").getLeader();
+            if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) {
+              // this is the bad case, our "bad" state was found before timeout
+              log.error("WTF: New Leader={}", newLeader);
+              return true;
+            }
+            return false; // still no bad state, wait for timeout
+          });
+      });
 
     JettySolrRunner j0 = cluster.getJettySolrRunner(0);
     j0.start();
@@ -211,13 +218,21 @@ public class TestCloudConsistency extends SolrCloudTestCase {
     waitForState("Timeout waiting for leader goes DOWN", collection, (liveNodes, collectionState)
         -> collectionState.getReplica(leader.getName()).getState() == Replica.State.DOWN);
 
-    TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
-    while (!timeOut.hasTimedOut()) {
-      Replica newLeader = getCollectionState(collection).getLeader("shard1");
-      if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) {
-        fail("Out of sync replica became leader " + newLeader);
-      }
-    }
+    // the meat of the test -- wait to see if a different replica becomes the leader
+    // the correct behavior is that this should time out; if it succeeds, we have a problem...
+    expectThrows(TimeoutException.class,
+                 "Did not time out waiting for new leader, out of sync replica became leader",
+                 () -> {
+                   cluster.getSolrClient().waitForState(collection, 10, TimeUnit.SECONDS, (state) -> {
+            Replica newLeader = state.getSlice("shard1").getLeader();
+            if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) {
+              // this is the bad case, our "bad" state was found before timeout
+              log.error("WTF: New Leader={}", newLeader);
+              return true;
+            }
+            return false; // still no bad state, wait for timeout
+          });
+      });
 
     proxies.get(cluster.getJettySolrRunner(0)).reopen();
     cluster.getJettySolrRunner(0).start();
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java
index 09d3885..d1c4d22 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java
@@ -25,6 +25,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
 
@@ -39,9 +40,7 @@ import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.RequestStatusState;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.cloud.Replica;
-import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.util.TestInjection;
-import org.apache.solr.util.TimeOut;
 import org.junit.After;
 import org.junit.Before;
 import org.slf4j.Logger;
@@ -172,13 +171,20 @@ public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
     waitForState("Timeout waiting for leader goes DOWN", COLLECTION, (liveNodes, collectionState)
                  -> collectionState.getReplica(leader.getName()).getState() == Replica.State.DOWN);
 
-    TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
-    while (!timeOut.hasTimedOut()) {
-      Replica newLeader = getCollectionState(COLLECTION).getLeader("shard1");
-      if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) {
-        fail("Out of sync replica became leader " + newLeader);
-      }
-    }
+    // Sanity check that a new (out of sync) replica doesn't come up in our place...
+    expectThrows(TimeoutException.class,
+                 "Did not time out waiting for new leader, out of sync replica became leader",
+                 () -> {
+                   cluster.getSolrClient().waitForState(COLLECTION, 10, TimeUnit.SECONDS, (state) -> {
+            Replica newLeader = state.getSlice("shard1").getLeader();
+            if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) {
+              // this is the bad case, our "bad" state was found before timeout
+              log.error("WTF: New Leader={}", newLeader);
+              return true;
+            }
+            return false; // still no bad state, wait for timeout
+          });
+      });
 
     log.info("Enabling TestInjection.updateLogReplayRandomPause");
     TestInjection.updateLogReplayRandomPause = "true:100";
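
For reference, expectThrows needs no new import in the diff above because it is provided by the shared test base classes. The rough, hypothetical re-implementation below (names are illustrative, not the project's API) makes explicit what the assertion checks: the wrapped body must throw the expected TimeoutException, and returning normally -- meaning waitForState observed the "bad" leader within 10 seconds -- fails the test with the given message.

    import java.util.concurrent.TimeoutException;

    // Illustrative only -- the real expectThrows lives in the shared test framework.
    public class ExpectThrowsSketch {

        @FunctionalInterface
        interface ThrowingRunnable {
            void run() throws Throwable;
        }

        static <T extends Throwable> T expectThrows(Class<T> expected, String noThrowMessage,
                                                    ThrowingRunnable body) {
            try {
                body.run();
            } catch (Throwable t) {
                if (expected.isInstance(t)) {
                    return expected.cast(t); // the expected exception was thrown: assertion passes
                }
                throw new AssertionError("Unexpected exception type: " + t, t);
            }
            // The body returned normally, i.e. the "bad" leader was observed before the timeout.
            throw new AssertionError(noThrowMessage);
        }

        public static void main(String[] args) {
            TimeoutException e = expectThrows(TimeoutException.class,
                "Did not time out waiting for new leader, out of sync replica became leader",
                () -> { throw new TimeoutException("no new leader within 10 seconds"); });
            System.out.println("Caught as expected: " + e.getMessage());
        }
    }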

