geode-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aba...@apache.org
Subject [1/2] geode git commit: GEODE-3024 race condition between server locator preparing membership views
Date Wed, 14 Jun 2017 18:05:14 GMT
Repository: geode
Updated Branches:
  refs/heads/release/1.2.0 24192d4fb -> 2ca301a97


GEODE-3024 race condition between server locator preparing membership views

If a locator is preparing a conflicting membership view we now abandon
preparation of a view in a cache server and pause before retrying.
This gives the locator time to gather information from the cache server's
view (which it receives in acks while preparing its own view),
incorporate them into a new view and send it out.  When the cache
server installs the new view from the locator it will shut down its
ViewCreator thread.

(cherry picked from commit 31b72ba48b2dda95954b30c14ae62a8730065b3f)


Project: http://git-wip-us.apache.org/repos/asf/geode/repo
Commit: http://git-wip-us.apache.org/repos/asf/geode/commit/796a2e25
Tree: http://git-wip-us.apache.org/repos/asf/geode/tree/796a2e25
Diff: http://git-wip-us.apache.org/repos/asf/geode/diff/796a2e25

Branch: refs/heads/release/1.2.0
Commit: 796a2e25acef78aa974d4432bf8f551b230f8c93
Parents: 24192d4
Author: Bruce Schuchardt <bschuchardt@pivotal.io>
Authored: Tue Jun 6 15:45:36 2017 -0700
Committer: Anthony Baker <abaker@apache.org>
Committed: Wed Jun 14 11:02:03 2017 -0700

----------------------------------------------------------------------
 .../membership/gms/membership/GMSJoinLeave.java | 110 +++++++++++++------
 .../membership/gms/messages/ViewAckMessage.java |   3 +-
 .../gms/membership/GMSJoinLeaveJUnitTest.java   |  58 ++++++++++
 3 files changed, 138 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/geode/blob/796a2e25/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
----------------------------------------------------------------------
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 8c67795..8abcc45 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -205,12 +205,12 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
   /**
    * collects responses to new views
    */
-  private ViewReplyProcessor viewProcessor = new ViewReplyProcessor(false);
+  ViewReplyProcessor viewProcessor = new ViewReplyProcessor(false);
 
   /**
    * collects responses to view preparation messages
    */
-  private ViewReplyProcessor prepareProcessor = new ViewReplyProcessor(true);
+  ViewReplyProcessor prepareProcessor = new ViewReplyProcessor(true);
 
   /**
    * whether quorum checks can cause a forced-disconnect
@@ -971,7 +971,8 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
 
     if (m.isPreparing()) {
       if (this.preparedView != null && this.preparedView.getViewId() >= view.getViewId())
{
-        services.getMessenger().send(new ViewAckMessage(m.getSender(), this.preparedView));
+        services.getMessenger()
+            .send(new ViewAckMessage(view.getViewId(), m.getSender(), this.preparedView));
       } else {
         this.preparedView = view;
         if (viewContainsMyUnjoinedAddress) {
@@ -1943,6 +1944,8 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     volatile boolean shutdown = false;
     volatile boolean waiting = false;
     volatile boolean testFlagForRemovalRequest = false;
+    // count of number of views abandoned due to conflicts
+    volatile int abandonedViews = 0;
 
     /**
      * initial view to install. guarded by synch on ViewCreator
@@ -1981,6 +1984,10 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
       return waiting;
     }
 
+    int getAbandonedViewCount() {
+      return abandonedViews;
+    }
+
     /**
      * All views should be sent by the ViewCreator thread, so if this member becomes coordinator
it
      * may have an initial view to transmit that announces the removal of the former coordinator
to
@@ -1998,34 +2005,47 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
     }
 
     private void sendInitialView() {
-      try {
-        if (initialView == null) {
-          return;
-        }
-        NetView v = preparedView;
-        if (v != null) {
-          processPreparedView(v);
-        }
+      boolean retry;
+      do {
+        retry = false;
         try {
-          NetView iView;
-          List<InternalDistributedMember> iJoins;
-          Set<InternalDistributedMember> iLeaves;
-          Set<InternalDistributedMember> iRemoves;
-          synchronized (this) {
-            iView = initialView;
-            iJoins = initialJoins;
-            iLeaves = initialLeaving;
-            iRemoves = initialRemovals;
+          if (initialView == null) {
+            return;
           }
-          if (iView != null) {
-            prepareAndSendView(iView, iJoins, iLeaves, iRemoves);
+          NetView v = preparedView;
+          if (v != null) {
+            processPreparedView(v);
+          }
+          try {
+            NetView iView;
+            List<InternalDistributedMember> iJoins;
+            Set<InternalDistributedMember> iLeaves;
+            Set<InternalDistributedMember> iRemoves;
+            synchronized (this) {
+              iView = initialView;
+              iJoins = initialJoins;
+              iLeaves = initialLeaving;
+              iRemoves = initialRemovals;
+            }
+            if (iView != null) {
+              prepareAndSendView(iView, iJoins, iLeaves, iRemoves);
+            }
+          } finally {
+            setInitialView(null, null, null, null);
+          }
+        } catch (ViewAbandonedException e) {
+          // another view creator is active - sleep a bit to let it finish or go away
+          retry = true;
+          try {
+            sleep(services.getConfig().getMemberTimeout());
+          } catch (InterruptedException e2) {
+            shutdown = true;
+            retry = false;
           }
-        } finally {
-          setInitialView(null, null, null, null);
+        } catch (InterruptedException e) {
+          shutdown = true;
         }
-      } catch (InterruptedException e) {
-        shutdown = true;
-      }
+      } while (retry);
     }
 
     /**
@@ -2125,6 +2145,17 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
               if (shutdown) {
                 return;
               }
+            } catch (ViewAbandonedException e) {
+              synchronized (viewRequests) {
+                viewRequests.addAll(requests);
+              }
+              // pause before reattempting so that another view creator can either finish
+              // or fail
+              try {
+                sleep(services.getConfig().getMemberTimeout());
+              } catch (InterruptedException e2) {
+                shutdown = true;
+              }
             } catch (DistributedSystemDisconnectedException e) {
               shutdown = true;
             } catch (InterruptedException e) {
@@ -2188,7 +2219,8 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
      * 
      * @throws InterruptedException
      */
-    void createAndSendView(List<DistributionMessage> requests) throws InterruptedException
{
+    void createAndSendView(List<DistributionMessage> requests)
+        throws InterruptedException, ViewAbandonedException {
       List<InternalDistributedMember> joinReqs = new ArrayList<>(10);
       Map<InternalDistributedMember, Integer> joinPorts = new HashMap<>(10);
       Set<InternalDistributedMember> leaveReqs = new HashSet<>(10);
@@ -2332,7 +2364,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
      */
     void prepareAndSendView(NetView newView, List<InternalDistributedMember> joinReqs,
         Set<InternalDistributedMember> leaveReqs, Set<InternalDistributedMember>
removalReqs)
-        throws InterruptedException {
+        throws InterruptedException, ViewAbandonedException {
       boolean prepared;
       do {
         if (this.shutdown || Thread.currentThread().isInterrupted()) {
@@ -2354,6 +2386,9 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
         logger.debug("view preparation phase completed.  prepared={}", prepared);
 
         NetView conflictingView = prepareProcessor.getConflictingView();
+        if (conflictingView == null) {
+          conflictingView = GMSJoinLeave.this.preparedView;
+        }
 
         if (prepared) {
           break;
@@ -2382,9 +2417,17 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
               || conflictingView.getViewId() > lastConflictingView.getViewId());
           if (conflictingViewIsMostRecent) {
             lastConflictingView = conflictingView;
-            logger.info(
-                "adding these crashed members from a conflicting view to the crash-set for
the next view: {}\nconflicting view: {}",
-                unresponsive, conflictingView);
+            // if I am not a locator and the conflicting view is from a locator I should
+            // let it take control and stop sending membership views
+            if (localAddress.getVmKind() != DistributionManager.LOCATOR_DM_TYPE &&
conflictingView
+                .getCreator().getVmKind() == DistributionManager.LOCATOR_DM_TYPE) {
+              logger.info("View preparation interrupted - a locator is taking over as "
+                  + "membership coordinator in this view: {}", conflictingView);
+              abandonedViews++;
+              throw new ViewAbandonedException();
+            }
+            logger.info("adding these crashed members from a conflicting view to the crash-set
"
+                + "for the next view: {}\nconflicting view: {}", unresponsive, conflictingView);
             failures.addAll(conflictingView.getCrashedMembers());
             // this member may have been kicked out of the conflicting view
             if (failures.contains(localAddress)) {
@@ -2644,4 +2687,7 @@ public class GMSJoinLeave implements JoinLeave, MessageHandler {
 
     return ret;
   }
+
+  static class ViewAbandonedException extends Exception {
+  }
 }

http://git-wip-us.apache.org/repos/asf/geode/blob/796a2e25/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messages/ViewAckMessage.java
----------------------------------------------------------------------
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messages/ViewAckMessage.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messages/ViewAckMessage.java
index 71bd6bc..12dd334 100644
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messages/ViewAckMessage.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messages/ViewAckMessage.java
@@ -37,9 +37,10 @@ public class ViewAckMessage extends HighPriorityDistributionMessage {
     this.preparing = preparing;
   }
 
-  public ViewAckMessage(InternalDistributedMember recipient, NetView alternateView) {
+  public ViewAckMessage(int viewId, InternalDistributedMember recipient, NetView alternateView)
{
     super();
     setRecipient(recipient);
+    this.viewId = viewId;
     this.alternateView = alternateView;
     this.preparing = true;
   }

http://git-wip-us.apache.org/repos/asf/geode/blob/796a2e25/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
----------------------------------------------------------------------
diff --git a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index 49b09ca..a31fa8d 100644
--- a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -19,6 +19,7 @@ import static java.util.concurrent.TimeUnit.SECONDS;
 import static org.hamcrest.core.IsEqual.equalTo;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.mockito.Matchers.isA;
 import static org.mockito.Mockito.mock;
@@ -28,6 +29,7 @@ import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
+import org.apache.geode.distributed.internal.DistributionManager;
 import org.awaitility.Awaitility;
 import org.apache.geode.distributed.DistributedMember;
 import org.apache.geode.distributed.internal.DistributionConfig;
@@ -79,6 +81,7 @@ import java.util.List;
 import java.util.Properties;
 import java.util.Set;
 import java.util.Timer;
+import java.util.concurrent.TimeUnit;
 
 @Category({IntegrationTest.class, MembershipTest.class})
 public class GMSJoinLeaveJUnitTest {
@@ -838,6 +841,61 @@ public class GMSJoinLeaveJUnitTest {
   }
 
   /**
+   * If a locator is started and sends out a view to take control of the cluster another
member that
+   * is also in the process of sending out a view should relinquish control to the new locator.
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testCoordinatorGetsConflictingViewFromLocator() throws Exception {
+    // create the GMSJoinLeave instance we'll be testing
+    initMocks(false);
+    InternalDistributedMember otherMember = mockMembers[0];
+    gmsJoinLeaveMemberId.setVmKind(DistributionManager.NORMAL_DM_TYPE);
+    List<InternalDistributedMember> members = createMemberList(gmsJoinLeaveMemberId,
otherMember);
+    prepareAndInstallView(gmsJoinLeaveMemberId, members);
+    NetView installedView = gmsJoinLeave.getView();
+
+    gmsJoinLeave.unitTesting.add("noRandomViewChange"); // keep view numbers predictable
+
+    // create a view coming from the locator that conflicts with the installed view
+    InternalDistributedMember locatorMemberId = new InternalDistributedMember("localhost",
+        mockMembers[mockMembers.length - 1].getPort() + 1);
+    locatorMemberId.setVmKind(DistributionManager.LOCATOR_DM_TYPE);
+    List<InternalDistributedMember> newMemberList = new ArrayList<>(members);
+    newMemberList.add(locatorMemberId);
+    NetView locatorView =
+        new NetView(locatorMemberId, installedView.getViewId() + 10, newMemberList);
+
+    // start the process to make our GMSJoinLeave become coordinator. It will send out a
view
+    // and want an ACK from
+    synchronized (gmsJoinLeave.getViewInstallationLock()) {
+      gmsJoinLeave.becomeCoordinator();
+    }
+    Awaitility.await().atMost(30, TimeUnit.SECONDS)
+        .until(() -> gmsJoinLeave.prepareProcessor.isWaiting());
+
+    int newViewId = 6; // becomeCoordinator will bump the initial view Id by 5
+
+    ViewAckMessage msg = new ViewAckMessage(gmsJoinLeaveMemberId, newViewId, true);
+    msg.setSender(gmsJoinLeaveMemberId);
+    gmsJoinLeave.processMessage(msg);
+
+    // ack the view on behalf of the other member, returning a conflicting view coming from
a
+    // locator that is trying to become coordinator
+    msg = new ViewAckMessage(newViewId, gmsJoinLeaveMemberId, locatorView);
+    msg.setSender(otherMember);
+    gmsJoinLeave.processMessage(msg);
+
+    Awaitility.await().atMost(30, TimeUnit.SECONDS)
+        .until(() -> gmsJoinLeave.getViewCreator() != null);
+    Awaitility.await().atMost(30, TimeUnit.SECONDS)
+        .until(() -> gmsJoinLeave.getViewCreator().getAbandonedViewCount() > 0);
+    assertEquals(installedView, gmsJoinLeave.getView());
+
+  }
+
+  /**
    * In a scenario where we have a member leave at the same time as an install view The member
that
    * left should be recorded on all members, if the coordinator also happens to leave, the
new
    * coordinator should be able to process the new view correctly


Mime
View raw message