From commits-return-7463-archive-asf-public=cust-asf.ponee.io@zookeeper.apache.org Mon Dec 17 15:20:18 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id 66984180652 for ; Mon, 17 Dec 2018 15:20:17 +0100 (CET) Received: (qmail 65168 invoked by uid 500); 17 Dec 2018 14:20:16 -0000 Mailing-List: contact commits-help@zookeeper.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@zookeeper.apache.org Delivered-To: mailing list commits@zookeeper.apache.org Received: (qmail 65157 invoked by uid 99); 17 Dec 2018 14:20:16 -0000 Received: from ec2-52-202-80-70.compute-1.amazonaws.com (HELO gitbox.apache.org) (52.202.80.70) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 17 Dec 2018 14:20:16 +0000 Received: by gitbox.apache.org (ASF Mail Server at gitbox.apache.org, from userid 33) id A6FDC85286; Mon, 17 Dec 2018 14:20:15 +0000 (UTC) Date: Mon, 17 Dec 2018 14:20:15 +0000 To: "commits@zookeeper.apache.org" Subject: [zookeeper] branch branch-3.5 updated: ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch and peerEpoch during leader election MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Message-ID: <154505641556.29438.17609932754265098347@gitbox.apache.org> From: andor@apache.org X-Git-Host: gitbox.apache.org X-Git-Repo: zookeeper X-Git-Refname: refs/heads/branch-3.5 X-Git-Reftype: branch X-Git-Oldrev: 4bd32bef2b75238e67b13bdfc7e6896d64c32434 X-Git-Newrev: 193a6ed04a1f2ae19829db974878f5ecb6505cbf X-Git-Rev: 193a6ed04a1f2ae19829db974878f5ecb6505cbf X-Git-NotificationType: ref_changed_plus_diff X-Git-Multimail-Version: 1.5.dev Auto-Submitted: auto-generated This is an automated email from the ASF dual-hosted git repository. andor pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/zookeeper.git The following commit(s) were added to refs/heads/branch-3.5 by this push: new 193a6ed ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch and peerEpoch during leader election 193a6ed is described below commit 193a6ed04a1f2ae19829db974878f5ecb6505cbf Author: Fangmin Lyu AuthorDate: Mon Dec 17 15:19:47 2018 +0100 ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch and peerEpoch during leader election Rebasing Fangmin Lyu's patch from https://github.com/lvfangmin/zookeeper/tree/ZOOKEEPER-1818 onto the 3.5 branch (cumulative with other PRs in flight). Author: Michael Edwards Author: Fangmin Lyu Reviewers: andor@apache.org Closes #714 from mkedwards/ZOOKEEPER-1818 and squashes the following commits: 6adf67749 [Michael Edwards] Apply changes from #703 code review 65198539f [Fangmin Lyu] ZOOKEEPER-1818: Correctly handle potentially inconsistent zxid/electionEpoch and peerEpoch during leader election --- .../server/quorum/FastLeaderElection.java | 48 +++----- .../apache/zookeeper/server/quorum/QuorumPeer.java | 34 +++++- .../org/apache/zookeeper/server/quorum/Vote.java | 41 ++++++- .../server/quorum/FLEOutOfElectionTest.java | 136 +++++++++++++++++++++ 4 files changed, 224 insertions(+), 35 deletions(-) diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java index e3cb045..401ca05 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java @@ -71,14 +71,6 @@ public class FastLeaderElection implements Election { final static int maxNotificationInterval = 60000; /** - * This value is passed to the methods that check the quorum - * majority of an established ensemble for those values that - * should not be taken into account in the comparison - * (electionEpoch and zxid). - */ - final static int IGNOREVALUE = -1; - - /** * Connection manager. Fast leader election uses TCP for * communication between peers, and QuorumCnxManager manages * such connections. @@ -732,7 +724,7 @@ public class FastLeaderElection implements Election { * @param vote * Identifier of the vote received last */ - private boolean termPredicate(HashMap votes, Vote vote) { + protected boolean termPredicate(Map votes, Vote vote) { SyncedLearnerTracker voteSet = new SyncedLearnerTracker(); voteSet.addQuorumVerifier(self.getQuorumVerifier()); if (self.getLastSeenQuorumVerifier() != null @@ -765,8 +757,8 @@ public class FastLeaderElection implements Election { * @param leader leader id * @param electionEpoch epoch id */ - private boolean checkLeader( - HashMap votes, + protected boolean checkLeader( + Map votes, long leader, long electionEpoch){ @@ -966,6 +958,7 @@ public class FastLeaderElection implements Election { ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch)); } + // don't care about the version if it's in LOOKING state recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch)); if (termPredicate(recvset, @@ -989,9 +982,9 @@ public class FastLeaderElection implements Election { if (n == null) { self.setPeerState((proposedLeader == self.getId()) ? ServerState.LEADING: learningState()); - Vote endVote = new Vote(proposedLeader, - proposedZxid, proposedEpoch); + proposedZxid, logicalclock.get(), + proposedEpoch); leaveInstance(endVote); return endVote; } @@ -1008,13 +1001,13 @@ public class FastLeaderElection implements Election { */ if(n.electionEpoch == logicalclock.get()){ recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch)); - if(termPredicate(recvset, new Vote(n.leader, + if(termPredicate(recvset, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)) && checkLeader(outofelection, n.leader, n.electionEpoch)) { self.setPeerState((n.leader == self.getId()) ? ServerState.LEADING: learningState()); - - Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch); + Vote endVote = new Vote(n.leader, + n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } @@ -1023,28 +1016,19 @@ public class FastLeaderElection implements Election { /* * Before joining an established ensemble, verify that * a majority are following the same leader. - * Only peer epoch is used to check that the votes come - * from the same ensemble. This is because there is at - * least one corner case in which the ensemble can be - * created with inconsistent zxid and election epoch - * info. However, given that only one ensemble can be - * running at a single point in time and that each - * epoch is used only once, using only the epoch to - * compare the votes is sufficient. - * - * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 */ - outofelection.put(n.sid, new Vote(n.leader, - IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state)); - if (termPredicate(outofelection, new Vote(n.leader, - IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state)) - && checkLeader(outofelection, n.leader, IGNOREVALUE)) { + outofelection.put(n.sid, new Vote(n.version, n.leader, + n.zxid, n.electionEpoch, n.peerEpoch, n.state)); + if (termPredicate(outofelection, new Vote(n.version, n.leader, + n.zxid, n.electionEpoch, n.peerEpoch, n.state)) + && checkLeader(outofelection, n.leader, n.electionEpoch)) { synchronized(this){ logicalclock.set(n.electionEpoch); self.setPeerState((n.leader == self.getId()) ? ServerState.LEADING: learningState()); } - Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch); + Vote endVote = new Vote(n.leader, n.zxid, + n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java index 260ccd9..4f07951 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java @@ -2059,8 +2059,40 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider /** * Updates leader election info to avoid inconsistencies when * a new server tries to join the ensemble. + * + * Here is the inconsistency scenario we try to solve by updating the peer + * epoch after following leader: + * + * Let's say we have an ensemble with 3 servers z1, z2 and z3. + * + * 1. z1, z2 were following z3 with peerEpoch to be 0xb8, the new epoch is + * 0xb9, aka current accepted epoch on disk. + * 2. z2 get restarted, which will use 0xb9 as it's peer epoch when loading + * the current accept epoch from disk. + * 3. z2 received notification from z1 and z3, which is following z3 with + * epoch 0xb8, so it started following z3 again with peer epoch 0xb8. + * 4. before z2 successfully connected to z3, z3 get restarted with new + * epoch 0xb9. + * 5. z2 will retry around a few round (default 5s) before giving up, + * meanwhile it will report z3 as leader. + * 6. z1 restarted, and looking with peer epoch 0xb9. + * 7. z1 voted z3, and z3 was elected as leader again with peer epoch 0xb9. + * 8. z2 successfully connected to z3 before giving up, but with peer + * epoch 0xb8. + * 9. z1 get restarted, looking for leader with peer epoch 0xba, but cannot + * join, because z2 is reporting peer epoch 0xb8, while z3 is reporting + * 0xb9. + * + * By updating the election vote after actually following leader, we can + * avoid this kind of stuck happened. + * + * Btw, the zxid and electionEpoch could be inconsistent because of the same + * reason, it's better to update these as well after syncing with leader, but + * that required protocol change which is non trivial. This problem is worked + * around by skipping comparing the zxid and electionEpoch when counting for + * votes for out of election servers during looking for leader. * - * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 + * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732} */ protected void updateElectionVote(long newEpoch) { Vote currentVote = getCurrentVote(); diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java index 8152c66..2c2b9bd 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java @@ -125,11 +125,48 @@ public class Vote { return false; } Vote other = (Vote) o; - return (id == other.id + + if ((state == ServerState.LOOKING) || + (other.state == ServerState.LOOKING)) { + return (id == other.id && zxid == other.zxid && electionEpoch == other.electionEpoch && peerEpoch == other.peerEpoch); - + } else { + /* + * There are two things going on in the logic below: + * + * 1. skip comparing the zxid and electionEpoch for votes for servers + * out of election. + * + * Need to skip those because they can be inconsistent due to + * scenarios described in QuorumPeer.updateElectionVote. + * + * And given that only one ensemble can be running at a single point + * in time and that each epoch is used only once, using only id and + * epoch to compare the votes is sufficient. + * + * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1805} + * + * 2. skip comparing peerEpoch if if we're running with mixed ensemble + * with (version > 0x0) and without the change (version = 0x0) + * introduced in ZOOKEEPER-1732. + * + * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732} + * + * The server running with and without ZOOKEEPER-1732 will return + * different peerEpoch. During rolling upgrades, it's possible + * that 2/5 servers are returning epoch 1, while the other 2/5 + * are returning epoch 2, the other server need to ignore the + * peerEpoch to be able to join it. + */ + if ((version > 0x0) ^ (other.version > 0x0)) { + return id == other.id; + } else { + return (id == other.id + && peerEpoch == other.peerEpoch); + } + } } @Override diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java new file mode 100644 index 0000000..df92750 --- /dev/null +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java @@ -0,0 +1,136 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.zookeeper.server.quorum; + +import java.io.File; +import java.net.InetSocketAddress; +import java.util.HashMap; +import java.util.Map; + +import org.apache.zookeeper.PortAssignment; +import org.apache.zookeeper.server.quorum.QuorumPeer; +import org.apache.zookeeper.server.quorum.FastLeaderElection.Notification; +import org.apache.zookeeper.server.quorum.Vote; +import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer; +import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState; +import org.apache.zookeeper.server.util.ZxidUtils; +import org.apache.zookeeper.test.ClientBase; +import org.apache.zookeeper.test.FLETest; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Test FastLeaderElection with out of election servers. + */ +public class FLEOutOfElectionTest { + + private FastLeaderElection fle; + + @Before + public void setUp() throws Exception { + File tmpdir = ClientBase.createTmpDir(); + Map peers = new HashMap(); + for(int i = 0; i < 5; i++) { + peers.put(Long.valueOf(i), new QuorumServer(Long.valueOf(i), + new InetSocketAddress("127.0.0.1", PortAssignment.unique()))); + } + QuorumPeer peer = new QuorumPeer(peers, tmpdir, tmpdir, + PortAssignment.unique(), 3, 3, 1000, 2, 2); + fle = new FastLeaderElection(peer, peer.createCnxnManager()); + } + + @Test + public void testIgnoringZxidElectionEpoch() { + Map votes = new HashMap(); + votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 2, ServerState.FOLLOWING)); + votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 2), 1, 2, ServerState.FOLLOWING)); + votes.put(3L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)); + votes.put(4L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING)); + + Assert.assertTrue(fle.termPredicate(votes, + new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING))); + } + + @Test + public void testElectionWIthDifferentVersion() { + Map votes = new HashMap(); + votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING)); + votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING)); + votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)); + votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING)); + + Assert.assertTrue(fle.termPredicate(votes, + new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING))); + } + + @Test + public void testLookingNormal() { + Map votes = new HashMap(); + votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING)); + votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING)); + votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING)); + votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LEADING)); + + Assert.assertTrue(fle.termPredicate(votes, + new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING))); + } + + @Test + public void testLookingDiffRounds() { + HashMap votes = new HashMap(); + votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.LOOKING)); + votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING)); + votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LOOKING)); + votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LEADING)); + + Assert.assertFalse(fle.termPredicate(votes, + new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING))); + } + + @Test + public void testOutofElection() { + HashMap outofelection = new HashMap(); + + outofelection.put(1L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING)); + outofelection.put(2L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING)); + outofelection.put(4L, new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.FOLLOWING)); + Vote vote = new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.LEADING); + outofelection.put(5L, vote); + + Notification n = new Notification(); + n.version = vote.getVersion(); + n.leader = vote.getId(); + n.zxid = vote.getZxid(); + n.electionEpoch = vote.getElectionEpoch(); + n.state = vote.getState(); + n.peerEpoch = vote.getPeerEpoch(); + n.sid = 5L; + + // Set the logical clock to 1 on fle instance of server 3. + fle.logicalclock.set(0x1); + + Assert.assertTrue("Quorum check failed", + fle.termPredicate(outofelection, new Vote(n.version, n.leader, + n.zxid, n.electionEpoch, n.peerEpoch, n.state))); + + Assert.assertTrue("Leader check failed", fle.checkLeader(outofelection, + n.leader, n.electionEpoch)); + } +}