zookeeper-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From an...@apache.org
Subject [zookeeper] branch branch-3.5 updated: ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch and peerEpoch during leader election
Date Mon, 17 Dec 2018 14:20:15 GMT
This is an automated email from the ASF dual-hosted git repository.

andor pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/zookeeper.git


The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new 193a6ed  ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch
and peerEpoch during leader election
193a6ed is described below

commit 193a6ed04a1f2ae19829db974878f5ecb6505cbf
Author: Fangmin Lyu <fangmin@apache.org>
AuthorDate: Mon Dec 17 15:19:47 2018 +0100

    ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch and peerEpoch
during leader election
    
    Rebasing Fangmin Lyu's patch from https://github.com/lvfangmin/zookeeper/tree/ZOOKEEPER-1818
onto the 3.5 branch (cumulative with other PRs in flight).
    
    Author: Michael Edwards <medwards@bitpusher.com>
    Author: Fangmin Lyu <fangmin@apache.org>
    
    Reviewers: andor@apache.org
    
    Closes #714 from mkedwards/ZOOKEEPER-1818 and squashes the following commits:
    
    6adf67749 [Michael Edwards] Apply changes from #703 code review
    65198539f [Fangmin Lyu] ZOOKEEPER-1818: Correctly handle potentially inconsistent zxid/electionEpoch
and peerEpoch during leader election
---
 .../server/quorum/FastLeaderElection.java          |  48 +++-----
 .../apache/zookeeper/server/quorum/QuorumPeer.java |  34 +++++-
 .../org/apache/zookeeper/server/quorum/Vote.java   |  41 ++++++-
 .../server/quorum/FLEOutOfElectionTest.java        | 136 +++++++++++++++++++++
 4 files changed, 224 insertions(+), 35 deletions(-)

diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
index e3cb045..401ca05 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
@@ -71,14 +71,6 @@ public class FastLeaderElection implements Election {
     final static int maxNotificationInterval = 60000;
     
     /**
-     * This value is passed to the methods that check the quorum
-     * majority of an established ensemble for those values that
-     * should not be taken into account in the comparison 
-     * (electionEpoch and zxid). 
-     */
-    final static int IGNOREVALUE = -1;
-
-    /**
      * Connection manager. Fast leader election uses TCP for
      * communication between peers, and QuorumCnxManager manages
      * such connections.
@@ -732,7 +724,7 @@ public class FastLeaderElection implements Election {
      * @param vote
      *            Identifier of the vote received last
      */
-    private boolean termPredicate(HashMap<Long, Vote> votes, Vote vote) {
+    protected boolean termPredicate(Map<Long, Vote> votes, Vote vote) {
         SyncedLearnerTracker voteSet = new SyncedLearnerTracker();
         voteSet.addQuorumVerifier(self.getQuorumVerifier());
         if (self.getLastSeenQuorumVerifier() != null
@@ -765,8 +757,8 @@ public class FastLeaderElection implements Election {
      * @param   leader  leader id
      * @param   electionEpoch   epoch id
      */
-    private boolean checkLeader(
-            HashMap<Long, Vote> votes,
+    protected boolean checkLeader(
+            Map<Long, Vote> votes,
             long leader,
             long electionEpoch){
 
@@ -966,6 +958,7 @@ public class FastLeaderElection implements Election {
                                     ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
                         }
 
+                        // don't care about the version if it's in LOOKING state
                         recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
 
                         if (termPredicate(recvset,
@@ -989,9 +982,9 @@ public class FastLeaderElection implements Election {
                             if (n == null) {
                                 self.setPeerState((proposedLeader == self.getId()) ?
                                         ServerState.LEADING: learningState());
-
                                 Vote endVote = new Vote(proposedLeader,
-                                        proposedZxid, proposedEpoch);
+                                        proposedZxid, logicalclock.get(), 
+                                        proposedEpoch);
                                 leaveInstance(endVote);
                                 return endVote;
                             }
@@ -1008,13 +1001,13 @@ public class FastLeaderElection implements Election {
                          */
                         if(n.electionEpoch == logicalclock.get()){
                             recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch,
n.peerEpoch));
-                            if(termPredicate(recvset, new Vote(n.leader,
+                            if(termPredicate(recvset, new Vote(n.version, n.leader,
                                             n.zxid, n.electionEpoch, n.peerEpoch, n.state))
                                             && checkLeader(outofelection, n.leader,
n.electionEpoch)) {
                                 self.setPeerState((n.leader == self.getId()) ?
                                         ServerState.LEADING: learningState());
-
-                                Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
+                                Vote endVote = new Vote(n.leader, 
+                                        n.zxid, n.electionEpoch, n.peerEpoch);
                                 leaveInstance(endVote);
                                 return endVote;
                             }
@@ -1023,28 +1016,19 @@ public class FastLeaderElection implements Election {
                         /*
                          * Before joining an established ensemble, verify that
                          * a majority are following the same leader.
-                         * Only peer epoch is used to check that the votes come
-                         * from the same ensemble. This is because there is at
-                         * least one corner case in which the ensemble can be
-                         * created with inconsistent zxid and election epoch
-                         * info. However, given that only one ensemble can be
-                         * running at a single point in time and that each 
-                         * epoch is used only once, using only the epoch to 
-                         * compare the votes is sufficient.
-                         * 
-                         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
                          */
-                        outofelection.put(n.sid, new Vote(n.leader, 
-                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
-                        if (termPredicate(outofelection, new Vote(n.leader,
-                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state))
-                                && checkLeader(outofelection, n.leader, IGNOREVALUE))
{
+                        outofelection.put(n.sid, new Vote(n.version, n.leader, 
+                                n.zxid, n.electionEpoch, n.peerEpoch, n.state));
+                        if (termPredicate(outofelection, new Vote(n.version, n.leader,
+                                n.zxid, n.electionEpoch, n.peerEpoch, n.state))
+                                && checkLeader(outofelection, n.leader, n.electionEpoch))
{
                             synchronized(this){
                                 logicalclock.set(n.electionEpoch);
                                 self.setPeerState((n.leader == self.getId()) ?
                                         ServerState.LEADING: learningState());
                             }
-                            Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
+                            Vote endVote = new Vote(n.leader, n.zxid, 
+                                    n.electionEpoch, n.peerEpoch);
                             leaveInstance(endVote);
                             return endVote;
                         }
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
index 260ccd9..4f07951 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
@@ -2059,8 +2059,40 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
     /**
      * Updates leader election info to avoid inconsistencies when
      * a new server tries to join the ensemble.
+     *
+     * Here is the inconsistency scenario we try to solve by updating the peer 
+     * epoch after following leader:
+     *
+     * Let's say we have an ensemble with 3 servers z1, z2 and z3.
+     *
+     * 1. z1, z2 were following z3 with peerEpoch to be 0xb8, the new epoch is 
+     *    0xb9, aka current accepted epoch on disk.
+     * 2. z2 get restarted, which will use 0xb9 as it's peer epoch when loading
+     *    the current accept epoch from disk.
+     * 3. z2 received notification from z1 and z3, which is following z3 with 
+     *    epoch 0xb8, so it started following z3 again with peer epoch 0xb8.
+     * 4. before z2 successfully connected to z3, z3 get restarted with new 
+     *    epoch 0xb9.
+     * 5. z2 will retry around a few round (default 5s) before giving up, 
+     *    meanwhile it will report z3 as leader.
+     * 6. z1 restarted, and looking with peer epoch 0xb9.
+     * 7. z1 voted z3, and z3 was elected as leader again with peer epoch 0xb9.
+     * 8. z2 successfully connected to z3 before giving up, but with peer 
+     *    epoch 0xb8.
+     * 9. z1 get restarted, looking for leader with peer epoch 0xba, but cannot 
+     *    join, because z2 is reporting peer epoch 0xb8, while z3 is reporting 
+     *    0xb9.
+     *
+     * By updating the election vote after actually following leader, we can 
+     * avoid this kind of stuck happened.
+     *
+     * Btw, the zxid and electionEpoch could be inconsistent because of the same 
+     * reason, it's better to update these as well after syncing with leader, but 
+     * that required protocol change which is non trivial. This problem is worked 
+     * around by skipping comparing the zxid and electionEpoch when counting for 
+     * votes for out of election servers during looking for leader.
      * 
-     * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+     * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732}
      */
     protected void updateElectionVote(long newEpoch) {
         Vote currentVote = getCurrentVote();
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java
index 8152c66..2c2b9bd 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java
@@ -125,11 +125,48 @@ public class Vote {
             return false;
         }
         Vote other = (Vote) o;
-        return (id == other.id
+
+        if ((state == ServerState.LOOKING) ||
+                (other.state == ServerState.LOOKING)) {
+            return (id == other.id
                     && zxid == other.zxid
                     && electionEpoch == other.electionEpoch
                     && peerEpoch == other.peerEpoch);
-
+        } else {
+            /*
+             * There are two things going on in the logic below:
+             * 
+             * 1. skip comparing the zxid and electionEpoch for votes for servers 
+             *    out of election. 
+             *    
+             *    Need to skip those because they can be inconsistent due to  
+             *    scenarios described in QuorumPeer.updateElectionVote. 
+             *
+             *    And given that only one ensemble can be running at a single point 
+             *    in time and that each epoch is used only once, using only id and 
+             *    epoch to compare the votes is sufficient.
+             *
+             *    {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1805}
+             *
+             * 2. skip comparing peerEpoch if if we're running with mixed ensemble 
+             *    with (version > 0x0) and without the change (version = 0x0) 
+             *    introduced in ZOOKEEPER-1732.
+             *
+             *    {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732}
+             *
+             *    The server running with and without ZOOKEEPER-1732 will return 
+             *    different peerEpoch. During rolling upgrades, it's possible
+             *    that 2/5 servers are returning epoch 1, while the other 2/5
+             *    are returning epoch 2, the other server need to ignore the 
+             *    peerEpoch to be able to join it.
+             */
+            if ((version > 0x0) ^ (other.version > 0x0)) {
+                return id == other.id;
+            } else {
+                return (id == other.id
+                        && peerEpoch == other.peerEpoch);
+            }
+        }
     }
 
     @Override
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java
b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java
new file mode 100644
index 0000000..df92750
--- /dev/null
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.zookeeper.server.quorum;
+
+import java.io.File;
+import java.net.InetSocketAddress;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.zookeeper.PortAssignment;
+import org.apache.zookeeper.server.quorum.QuorumPeer;
+import org.apache.zookeeper.server.quorum.FastLeaderElection.Notification;
+import org.apache.zookeeper.server.quorum.Vote;
+import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
+import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState;
+import org.apache.zookeeper.server.util.ZxidUtils;
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.FLETest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test FastLeaderElection with out of election servers.
+ */
+public class FLEOutOfElectionTest {
+
+    private FastLeaderElection fle;
+
+    @Before
+    public void setUp() throws Exception {
+        File tmpdir = ClientBase.createTmpDir();
+        Map<Long, QuorumServer> peers = new HashMap<Long,QuorumServer>();
+        for(int i = 0; i < 5; i++) {
+            peers.put(Long.valueOf(i), new QuorumServer(Long.valueOf(i), 
+                    new InetSocketAddress("127.0.0.1", PortAssignment.unique())));
+        }
+        QuorumPeer peer = new QuorumPeer(peers, tmpdir, tmpdir, 
+                PortAssignment.unique(), 3, 3, 1000, 2, 2);
+        fle = new FastLeaderElection(peer, peer.createCnxnManager());
+    }
+
+    @Test
+    public void testIgnoringZxidElectionEpoch() {
+        Map<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 2, ServerState.FOLLOWING));
+        votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 2), 1, 2, ServerState.FOLLOWING));
+        votes.put(3L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING));
+        votes.put(4L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING));
+
+        Assert.assertTrue(fle.termPredicate(votes, 
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)));
+    }
+
+    @Test
+    public void testElectionWIthDifferentVersion() {
+        Map<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING));
+        votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING));
+        votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING));
+        votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING));
+
+        Assert.assertTrue(fle.termPredicate(votes, 
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)));
+    }
+
+    @Test
+    public void testLookingNormal() {
+        Map<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+        votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+        votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+        votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LEADING));
+
+        Assert.assertTrue(fle.termPredicate(votes, 
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING)));
+    }
+
+    @Test
+    public void testLookingDiffRounds() {
+        HashMap<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.LOOKING));
+        votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING));
+        votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LOOKING));
+        votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LEADING));
+
+        Assert.assertFalse(fle.termPredicate(votes,
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING)));
+    }
+
+    @Test
+    public void testOutofElection() {
+        HashMap<Long,Vote> outofelection = new HashMap<Long,Vote>();
+
+        outofelection.put(1L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING));
+        outofelection.put(2L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING));
+        outofelection.put(4L, new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.FOLLOWING));
+        Vote vote = new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.LEADING);
+        outofelection.put(5L, vote);
+
+        Notification n = new Notification();
+        n.version = vote.getVersion();
+        n.leader = vote.getId();
+        n.zxid = vote.getZxid();
+        n.electionEpoch = vote.getElectionEpoch();
+        n.state = vote.getState();
+        n.peerEpoch = vote.getPeerEpoch();
+        n.sid = 5L;
+        
+        // Set the logical clock to 1 on fle instance of server 3.
+        fle.logicalclock.set(0x1);
+
+        Assert.assertTrue("Quorum check failed",
+                fle.termPredicate(outofelection, new Vote(n.version, n.leader, 
+                        n.zxid, n.electionEpoch, n.peerEpoch, n.state)));
+
+        Assert.assertTrue("Leader check failed", fle.checkLeader(outofelection, 
+                n.leader, n.electionEpoch)); 
+    }
+}


Mime
View raw message