This is an automated email from the ASF dual-hosted git repository.
andor pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/zookeeper.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new 193a6ed ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch
and peerEpoch during leader election
193a6ed is described below
commit 193a6ed04a1f2ae19829db974878f5ecb6505cbf
Author: Fangmin Lyu <fangmin@apache.org>
AuthorDate: Mon Dec 17 15:19:47 2018 +0100
ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch and peerEpoch
during leader election
Rebasing Fangmin Lyu's patch from https://github.com/lvfangmin/zookeeper/tree/ZOOKEEPER-1818
onto the 3.5 branch (cumulative with other PRs in flight).
Author: Michael Edwards <medwards@bitpusher.com>
Author: Fangmin Lyu <fangmin@apache.org>
Reviewers: andor@apache.org
Closes #714 from mkedwards/ZOOKEEPER-1818 and squashes the following commits:
6adf67749 [Michael Edwards] Apply changes from #703 code review
65198539f [Fangmin Lyu] ZOOKEEPER-1818: Correctly handle potentially inconsistent zxid/electionEpoch
and peerEpoch during leader election
---
.../server/quorum/FastLeaderElection.java | 48 +++-----
.../apache/zookeeper/server/quorum/QuorumPeer.java | 34 +++++-
.../org/apache/zookeeper/server/quorum/Vote.java | 41 ++++++-
.../server/quorum/FLEOutOfElectionTest.java | 136 +++++++++++++++++++++
4 files changed, 224 insertions(+), 35 deletions(-)
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
index e3cb045..401ca05 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java
@@ -71,14 +71,6 @@ public class FastLeaderElection implements Election {
final static int maxNotificationInterval = 60000;
/**
- * This value is passed to the methods that check the quorum
- * majority of an established ensemble for those values that
- * should not be taken into account in the comparison
- * (electionEpoch and zxid).
- */
- final static int IGNOREVALUE = -1;
-
- /**
* Connection manager. Fast leader election uses TCP for
* communication between peers, and QuorumCnxManager manages
* such connections.
@@ -732,7 +724,7 @@ public class FastLeaderElection implements Election {
* @param vote
* Identifier of the vote received last
*/
- private boolean termPredicate(HashMap<Long, Vote> votes, Vote vote) {
+ protected boolean termPredicate(Map<Long, Vote> votes, Vote vote) {
SyncedLearnerTracker voteSet = new SyncedLearnerTracker();
voteSet.addQuorumVerifier(self.getQuorumVerifier());
if (self.getLastSeenQuorumVerifier() != null
@@ -765,8 +757,8 @@ public class FastLeaderElection implements Election {
* @param leader leader id
* @param electionEpoch epoch id
*/
- private boolean checkLeader(
- HashMap<Long, Vote> votes,
+ protected boolean checkLeader(
+ Map<Long, Vote> votes,
long leader,
long electionEpoch){
@@ -966,6 +958,7 @@ public class FastLeaderElection implements Election {
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
+ // don't care about the version if it's in LOOKING state
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
if (termPredicate(recvset,
@@ -989,9 +982,9 @@ public class FastLeaderElection implements Election {
if (n == null) {
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
-
Vote endVote = new Vote(proposedLeader,
- proposedZxid, proposedEpoch);
+ proposedZxid, logicalclock.get(),
+ proposedEpoch);
leaveInstance(endVote);
return endVote;
}
@@ -1008,13 +1001,13 @@ public class FastLeaderElection implements Election {
*/
if(n.electionEpoch == logicalclock.get()){
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch,
n.peerEpoch));
- if(termPredicate(recvset, new Vote(n.leader,
+ if(termPredicate(recvset, new Vote(n.version, n.leader,
n.zxid, n.electionEpoch, n.peerEpoch, n.state))
&& checkLeader(outofelection, n.leader,
n.electionEpoch)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
-
- Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
+ Vote endVote = new Vote(n.leader,
+ n.zxid, n.electionEpoch, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
@@ -1023,28 +1016,19 @@ public class FastLeaderElection implements Election {
/*
* Before joining an established ensemble, verify that
* a majority are following the same leader.
- * Only peer epoch is used to check that the votes come
- * from the same ensemble. This is because there is at
- * least one corner case in which the ensemble can be
- * created with inconsistent zxid and election epoch
- * info. However, given that only one ensemble can be
- * running at a single point in time and that each
- * epoch is used only once, using only the epoch to
- * compare the votes is sufficient.
- *
- * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
*/
- outofelection.put(n.sid, new Vote(n.leader,
- IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
- if (termPredicate(outofelection, new Vote(n.leader,
- IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state))
- && checkLeader(outofelection, n.leader, IGNOREVALUE))
{
+ outofelection.put(n.sid, new Vote(n.version, n.leader,
+ n.zxid, n.electionEpoch, n.peerEpoch, n.state));
+ if (termPredicate(outofelection, new Vote(n.version, n.leader,
+ n.zxid, n.electionEpoch, n.peerEpoch, n.state))
+ && checkLeader(outofelection, n.leader, n.electionEpoch))
{
synchronized(this){
logicalclock.set(n.electionEpoch);
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
- Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
+ Vote endVote = new Vote(n.leader, n.zxid,
+ n.electionEpoch, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
index 260ccd9..4f07951 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java
@@ -2059,8 +2059,40 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
/**
* Updates leader election info to avoid inconsistencies when
* a new server tries to join the ensemble.
+ *
+ * Here is the inconsistency scenario we try to solve by updating the peer
+ * epoch after following leader:
+ *
+ * Let's say we have an ensemble with 3 servers z1, z2 and z3.
+ *
+ * 1. z1, z2 were following z3 with peerEpoch to be 0xb8, the new epoch is
+ * 0xb9, aka current accepted epoch on disk.
+ * 2. z2 get restarted, which will use 0xb9 as it's peer epoch when loading
+ * the current accept epoch from disk.
+ * 3. z2 received notification from z1 and z3, which is following z3 with
+ * epoch 0xb8, so it started following z3 again with peer epoch 0xb8.
+ * 4. before z2 successfully connected to z3, z3 get restarted with new
+ * epoch 0xb9.
+ * 5. z2 will retry around a few round (default 5s) before giving up,
+ * meanwhile it will report z3 as leader.
+ * 6. z1 restarted, and looking with peer epoch 0xb9.
+ * 7. z1 voted z3, and z3 was elected as leader again with peer epoch 0xb9.
+ * 8. z2 successfully connected to z3 before giving up, but with peer
+ * epoch 0xb8.
+ * 9. z1 get restarted, looking for leader with peer epoch 0xba, but cannot
+ * join, because z2 is reporting peer epoch 0xb8, while z3 is reporting
+ * 0xb9.
+ *
+ * By updating the election vote after actually following leader, we can
+ * avoid this kind of stuck happened.
+ *
+ * Btw, the zxid and electionEpoch could be inconsistent because of the same
+ * reason, it's better to update these as well after syncing with leader, but
+ * that required protocol change which is non trivial. This problem is worked
+ * around by skipping comparing the zxid and electionEpoch when counting for
+ * votes for out of election servers during looking for leader.
*
- * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+ * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732}
*/
protected void updateElectionVote(long newEpoch) {
Vote currentVote = getCurrentVote();
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java
index 8152c66..2c2b9bd 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java
@@ -125,11 +125,48 @@ public class Vote {
return false;
}
Vote other = (Vote) o;
- return (id == other.id
+
+ if ((state == ServerState.LOOKING) ||
+ (other.state == ServerState.LOOKING)) {
+ return (id == other.id
&& zxid == other.zxid
&& electionEpoch == other.electionEpoch
&& peerEpoch == other.peerEpoch);
-
+ } else {
+ /*
+ * There are two things going on in the logic below:
+ *
+ * 1. skip comparing the zxid and electionEpoch for votes for servers
+ * out of election.
+ *
+ * Need to skip those because they can be inconsistent due to
+ * scenarios described in QuorumPeer.updateElectionVote.
+ *
+ * And given that only one ensemble can be running at a single point
+ * in time and that each epoch is used only once, using only id and
+ * epoch to compare the votes is sufficient.
+ *
+ * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1805}
+ *
+ * 2. skip comparing peerEpoch if if we're running with mixed ensemble
+ * with (version > 0x0) and without the change (version = 0x0)
+ * introduced in ZOOKEEPER-1732.
+ *
+ * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732}
+ *
+ * The server running with and without ZOOKEEPER-1732 will return
+ * different peerEpoch. During rolling upgrades, it's possible
+ * that 2/5 servers are returning epoch 1, while the other 2/5
+ * are returning epoch 2, the other server need to ignore the
+ * peerEpoch to be able to join it.
+ */
+ if ((version > 0x0) ^ (other.version > 0x0)) {
+ return id == other.id;
+ } else {
+ return (id == other.id
+ && peerEpoch == other.peerEpoch);
+ }
+ }
}
@Override
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java
b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java
new file mode 100644
index 0000000..df92750
--- /dev/null
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.zookeeper.server.quorum;
+
+import java.io.File;
+import java.net.InetSocketAddress;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.zookeeper.PortAssignment;
+import org.apache.zookeeper.server.quorum.QuorumPeer;
+import org.apache.zookeeper.server.quorum.FastLeaderElection.Notification;
+import org.apache.zookeeper.server.quorum.Vote;
+import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
+import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState;
+import org.apache.zookeeper.server.util.ZxidUtils;
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.FLETest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test FastLeaderElection with out of election servers.
+ */
+public class FLEOutOfElectionTest {
+
+ private FastLeaderElection fle;
+
+ @Before
+ public void setUp() throws Exception {
+ File tmpdir = ClientBase.createTmpDir();
+ Map<Long, QuorumServer> peers = new HashMap<Long,QuorumServer>();
+ for(int i = 0; i < 5; i++) {
+ peers.put(Long.valueOf(i), new QuorumServer(Long.valueOf(i),
+ new InetSocketAddress("127.0.0.1", PortAssignment.unique())));
+ }
+ QuorumPeer peer = new QuorumPeer(peers, tmpdir, tmpdir,
+ PortAssignment.unique(), 3, 3, 1000, 2, 2);
+ fle = new FastLeaderElection(peer, peer.createCnxnManager());
+ }
+
+ @Test
+ public void testIgnoringZxidElectionEpoch() {
+ Map<Long, Vote> votes = new HashMap<Long, Vote>();
+ votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 2, ServerState.FOLLOWING));
+ votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 2), 1, 2, ServerState.FOLLOWING));
+ votes.put(3L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING));
+ votes.put(4L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING));
+
+ Assert.assertTrue(fle.termPredicate(votes,
+ new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)));
+ }
+
+ @Test
+ public void testElectionWIthDifferentVersion() {
+ Map<Long, Vote> votes = new HashMap<Long, Vote>();
+ votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING));
+ votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING));
+ votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING));
+ votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING));
+
+ Assert.assertTrue(fle.termPredicate(votes,
+ new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)));
+ }
+
+ @Test
+ public void testLookingNormal() {
+ Map<Long, Vote> votes = new HashMap<Long, Vote>();
+ votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+ votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+ votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+ votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LEADING));
+
+ Assert.assertTrue(fle.termPredicate(votes,
+ new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING)));
+ }
+
+ @Test
+ public void testLookingDiffRounds() {
+ HashMap<Long, Vote> votes = new HashMap<Long, Vote>();
+ votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.LOOKING));
+ votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING));
+ votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LOOKING));
+ votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LEADING));
+
+ Assert.assertFalse(fle.termPredicate(votes,
+ new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING)));
+ }
+
+ @Test
+ public void testOutofElection() {
+ HashMap<Long,Vote> outofelection = new HashMap<Long,Vote>();
+
+ outofelection.put(1L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING));
+ outofelection.put(2L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING));
+ outofelection.put(4L, new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.FOLLOWING));
+ Vote vote = new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.LEADING);
+ outofelection.put(5L, vote);
+
+ Notification n = new Notification();
+ n.version = vote.getVersion();
+ n.leader = vote.getId();
+ n.zxid = vote.getZxid();
+ n.electionEpoch = vote.getElectionEpoch();
+ n.state = vote.getState();
+ n.peerEpoch = vote.getPeerEpoch();
+ n.sid = 5L;
+
+ // Set the logical clock to 1 on fle instance of server 3.
+ fle.logicalclock.set(0x1);
+
+ Assert.assertTrue("Quorum check failed",
+ fle.termPredicate(outofelection, new Vote(n.version, n.leader,
+ n.zxid, n.electionEpoch, n.peerEpoch, n.state)));
+
+ Assert.assertTrue("Leader check failed", fle.checkLeader(outofelection,
+ n.leader, n.electionEpoch));
+ }
+}
|