Author: ivank Date: Thu Apr 24 11:40:47 2014 New Revision: 1589675 URL: http://svn.apache.org/r1589675 Log: BOOKKEEPER-742: Fix for empty ledgers losing quorum. (ivank) Modified: zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java Modified: zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt (original) +++ zookeeper/bookkeeper/branches/branch-4.2/CHANGES.txt Thu Apr 24 11:40:47 2014 @@ -20,6 +20,8 @@ Release 4.2.3 - 2013-12-04 BOOKKEEPER-710: OpenLedgerNoRecovery should watch ensemble change. (sijie, ivank via fpj) + BOOKKEEPER-742: Fix for empty ledgers losing quorum. (ivank) + Release 4.2.2 - 2013-10-02 Backward compatible changes: Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java Thu Apr 24 11:40:47 2014 @@ -873,4 +873,11 @@ public class BookKeeperAdmin { } }; } + + /** + * @return the metadata for the passed ledger handle + */ + public LedgerMetadata getLedgerMetadata(LedgerHandle lh) { + return lh.getLedgerMetadata(); + } } Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java Thu Apr 24 11:40:47 2014 @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.Map; import java.util.Set; +import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; @@ -62,6 +63,8 @@ import org.apache.bookkeeper.replication import org.apache.bookkeeper.replication.ReplicationException.UnavailableException; import org.apache.commons.collections.CollectionUtils; import com.google.common.collect.Sets; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.util.concurrent.SettableFuture; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.ZooKeeper; @@ -146,47 +149,48 @@ public class Auditor implements BookiesL } } - private synchronized void submitAuditTask() { - synchronized (this) { - if (executor.isShutdown()) { - return; - } - executor.submit(new Runnable() { - public void run() { - try { - waitIfLedgerReplicationDisabled(); + @VisibleForTesting + synchronized Future submitAuditTask() { + if (executor.isShutdown()) { + SettableFuture f = SettableFuture.create(); + f.setException(new BKAuditException("Auditor shutting down")); + return f; + } + return executor.submit(new Runnable() { + public void run() { + try { + waitIfLedgerReplicationDisabled(); - List availableBookies = getAvailableBookies(); + List availableBookies = getAvailableBookies(); - // casting to String, as knownBookies and availableBookies - // contains only String values - // find new bookies(if any) and update the known bookie list - Collection newBookies = CollectionUtils.subtract( - availableBookies, knownBookies); - knownBookies.addAll(newBookies); - - // find lost bookies(if any) - Collection lostBookies = CollectionUtils.subtract( - knownBookies, availableBookies); - - if (lostBookies.size() > 0) { - knownBookies.removeAll(lostBookies); - Map> ledgerDetails = generateBookie2LedgersIndex(); - handleLostBookies(lostBookies, ledgerDetails); - } - } catch (BKException bke) { - LOG.error("Exception getting bookie list", bke); - } catch (InterruptedException ie) { - Thread.currentThread().interrupt(); - LOG.error("Interrupted while watching available bookies ", ie); - } catch (BKAuditException bke) { - LOG.error("Exception while watching available bookies", bke); - } catch (UnavailableException ue) { - LOG.error("Exception while watching available bookies", ue); + // casting to String, as knownBookies and availableBookies + // contains only String values + // find new bookies(if any) and update the known bookie list + Collection newBookies = CollectionUtils.subtract( + availableBookies, knownBookies); + knownBookies.addAll(newBookies); + + // find lost bookies(if any) + Collection lostBookies = CollectionUtils.subtract( + knownBookies, availableBookies); + + if (lostBookies.size() > 0) { + knownBookies.removeAll(lostBookies); + Map> ledgerDetails = generateBookie2LedgersIndex(); + handleLostBookies(lostBookies, ledgerDetails); } + } catch (BKException bke) { + LOG.error("Exception getting bookie list", bke); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + LOG.error("Interrupted while watching available bookies ", ie); + } catch (BKAuditException bke) { + LOG.error("Exception while watching available bookies", bke); + } catch (UnavailableException ue) { + LOG.error("Exception while watching available bookies", ue); } - }); - } + } + }); } public void start() { Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java Thu Apr 24 11:40:47 2014 @@ -269,6 +269,11 @@ public class AuditorElector { executor.submit(r); } + @VisibleForTesting + Auditor getAuditor() { + return auditor; + } + /** * Shutting down AuditorElector */ Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java Thu Apr 24 11:40:47 2014 @@ -24,6 +24,8 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; +import com.google.common.annotations.VisibleForTesting; + import org.apache.bookkeeper.bookie.Bookie; import org.apache.bookkeeper.bookie.ExitCode; import org.apache.bookkeeper.conf.ServerConfiguration; @@ -140,6 +142,11 @@ public class AutoRecoveryMain { return exitCode; } + @VisibleForTesting + public Auditor getAuditor() { + return auditorElector.getAuditor(); + } + /** Is auto-recovery service running? */ public boolean isAutoRecoveryRunning() { return running; Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java Thu Apr 24 11:40:47 2014 @@ -70,5 +70,9 @@ public abstract class ReplicationExcepti BKAuditException(String message, Throwable cause) { super(message, cause); } + + BKAuditException(String message) { + super(message); + } } } Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java Thu Apr 24 11:40:47 2014 @@ -25,6 +25,9 @@ import java.util.List; import java.util.Set; import java.util.Timer; import java.util.TimerTask; +import java.util.SortedMap; +import java.util.ArrayList; +import java.util.Collection; import java.util.concurrent.CountDownLatch; import org.apache.bookkeeper.client.BKException; @@ -33,6 +36,7 @@ import org.apache.bookkeeper.client.Book import org.apache.bookkeeper.client.LedgerChecker; import org.apache.bookkeeper.client.LedgerFragment; import org.apache.bookkeeper.client.LedgerHandle; +import org.apache.bookkeeper.client.LedgerMetadata; import org.apache.bookkeeper.client.BKException.BKBookieHandleNotAvailableException; import org.apache.bookkeeper.client.BKException.BKNoSuchLedgerExistsException; import org.apache.bookkeeper.client.BKException.BKReadException; @@ -195,7 +199,7 @@ public class ReplicationWorker implement } } - if (foundOpenFragments) { + if (foundOpenFragments || isLastSegmentOpenAndMissingBookies(lh)) { deferLedgerLockRelease(ledgerIdToReplicate); return; } @@ -213,6 +217,45 @@ public class ReplicationWorker implement } } + /** + * When checking the fragments of a ledger, there is a corner case + * where if the last segment/ensemble is open, but nothing has been written to + * some of the quorums in the ensemble, bookies can fail without any action being + * taken. This is fine, until enough bookies fail to cause a quorum to become + * unavailable, by which time the ledger is unrecoverable. + * + * For example, if in a E3Q2, only 1 entry is written and the last bookie + * in the ensemble fails, nothing has been written to it, so nothing needs to be + * recovered. But if the second to last bookie fails, we've now lost quorum for + * the second entry, so it's impossible to see if the second has been written or + * not. + * + * To avoid this situation, we need to check if bookies in the final open ensemble + * are unavailable, and take action if so. The action to take is to close the ledger, + * after a grace period as the writting client may replace the faulty bookie on its + * own. + * + * Missing bookies in closed ledgers are fine, as we know the last confirmed add, so + * we can tell which entries are supposed to exist and rereplicate them if necessary. + */ + private boolean isLastSegmentOpenAndMissingBookies(LedgerHandle lh) throws BKException { + LedgerMetadata md = admin.getLedgerMetadata(lh); + if (md.isClosed()) { + return false; + } + + SortedMap> ensembles + = admin.getLedgerMetadata(lh).getEnsembles(); + ArrayList finalEnsemble = ensembles.get(ensembles.lastKey()); + Collection available = admin.getAvailableBookies(); + for (InetSocketAddress b : finalEnsemble) { + if (!available.contains(b)) { + return true; + } + } + return false; + } + /** Gets the under replicated fragments */ private Set getUnderreplicatedFragments(LedgerHandle lh) throws InterruptedException { @@ -235,6 +278,10 @@ public class ReplicationWorker implement LedgerHandle lh = null; try { lh = admin.openLedgerNoRecovery(ledgerId); + if (isLastSegmentOpenAndMissingBookies(lh)) { + lh = admin.openLedger(ledgerId); + } + Set fragments = getUnderreplicatedFragments(lh); for (LedgerFragment fragment : fragments) { if (!fragment.isClosed()) { Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java Thu Apr 24 11:40:47 2014 @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.List; import java.util.SortedMap; import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import org.apache.bookkeeper.client.BKException; import org.apache.bookkeeper.client.BookKeeper.DigestType; @@ -37,7 +38,7 @@ import org.apache.bookkeeper.meta.ZkLedg import org.apache.bookkeeper.proto.BookieServer; import org.apache.bookkeeper.replication.ReplicationException.CompatibilityException; import org.apache.bookkeeper.replication.ReplicationException.UnavailableException; -import org.apache.bookkeeper.test.MultiLedgerManagerTestCase; +import org.apache.bookkeeper.test.BookKeeperClusterTestCase; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; @@ -52,8 +53,7 @@ import org.slf4j.LoggerFactory; * Auditor-rereplication process: Auditor will publish the bookie failures, * consequently ReplicationWorker will get the notifications and act on it. */ -public class BookieAutoRecoveryTest extends - MultiLedgerManagerTestCase { +public class BookieAutoRecoveryTest extends BookKeeperClusterTestCase { private static final Logger LOG = LoggerFactory .getLogger(BookieAutoRecoveryTest.class); private static final byte[] PASSWD = "admin".getBytes(); @@ -68,15 +68,15 @@ public class BookieAutoRecoveryTest exte private final String UNDERREPLICATED_PATH = baseClientConf .getZkLedgersRootPath() + "/underreplication/ledgers"; - public BookieAutoRecoveryTest(String ledgerManagerFactory) throws IOException, KeeperException, + public BookieAutoRecoveryTest() throws IOException, KeeperException, InterruptedException, UnavailableException, CompatibilityException { super(3); - LOG.info("Running test case using ledger manager : " - + ledgerManagerFactory); - // set ledger manager name - baseConf.setLedgerManagerFactoryClassName(ledgerManagerFactory); + + baseConf.setLedgerManagerFactoryClassName( + "org.apache.bookkeeper.meta.HierarchicalLedgerManagerFactory"); baseConf.setOpenLedgerRereplicationGracePeriod(openLedgerRereplicationGracePeriod); - baseClientConf.setLedgerManagerFactoryClassName(ledgerManagerFactory); + baseClientConf.setLedgerManagerFactoryClassName( + "org.apache.bookkeeper.meta.HierarchicalLedgerManagerFactory"); this.digestType = DigestType.MAC; setAutoRecoveryEnabled(true); } @@ -342,6 +342,51 @@ public class BookieAutoRecoveryTest exte } } + /** + * Test that if a empty ledger loses the bookie not in the quorum for entry 0, it will + * still be openable when it loses enough bookies to lose a whole quorum. + */ + @Test(timeout=10000) + public void testEmptyLedgerLosesQuorumEventually() throws Exception { + LedgerHandle lh = bkc.createLedger(3, 2, 2, DigestType.CRC32, PASSWD); + CountDownLatch latch = new CountDownLatch(1); + String urZNode = getUrLedgerZNode(lh); + watchUrLedgerNode(urZNode, latch); + + InetSocketAddress replicaToKill = LedgerHandleAdapter + .getLedgerMetadata(lh).getEnsembles().get(0L).get(2); + LOG.info("Killing last bookie, {}, in ensemble {}", replicaToKill, + LedgerHandleAdapter.getLedgerMetadata(lh).getEnsembles().get(0L)); + killBookie(replicaToKill); + + getAuditor().submitAuditTask().get(); // ensure auditor runs + + assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS)); + latch = new CountDownLatch(1); + Stat s = watchUrLedgerNode(urZNode, latch); // should be marked as replicated + if (s != null) { + assertTrue("Should be marked as replicated", latch.await(10, TimeUnit.SECONDS)); + } + + replicaToKill = LedgerHandleAdapter + .getLedgerMetadata(lh).getEnsembles().get(0L).get(1); + LOG.info("Killing second bookie, {}, in ensemble {}", replicaToKill, + LedgerHandleAdapter.getLedgerMetadata(lh).getEnsembles().get(0L)); + killBookie(replicaToKill); + + getAuditor().submitAuditTask().get(); // ensure auditor runs + + assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS)); + latch = new CountDownLatch(1); + s = watchUrLedgerNode(urZNode, latch); // should be marked as replicated + if (s != null) { + assertTrue("Should be marked as replicated", latch.await(5, TimeUnit.SECONDS)); + } + + // should be able to open ledger without issue + bkc.openLedger(lh.getId(), DigestType.CRC32, PASSWD); + } + private int getReplicaIndexInLedger(LedgerHandle lh, InetSocketAddress replicaToKill) { SortedMap> ensembles = LedgerHandleAdapter Modified: zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java?rev=1589675&r1=1589674&r2=1589675&view=diff ============================================================================== --- zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java (original) +++ zookeeper/bookkeeper/branches/branch-4.2/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java Thu Apr 24 11:40:47 2014 @@ -43,6 +43,7 @@ import org.apache.bookkeeper.conf.Server import org.apache.bookkeeper.metastore.InMemoryMetaStore; import org.apache.bookkeeper.proto.BookieServer; import org.apache.bookkeeper.replication.AutoRecoveryMain; +import org.apache.bookkeeper.replication.Auditor; import org.apache.bookkeeper.replication.ReplicationException.CompatibilityException; import org.apache.bookkeeper.replication.ReplicationException.UnavailableException; import org.apache.commons.io.FileUtils; @@ -529,4 +530,14 @@ public abstract class BookKeeperClusterT + autoRecoveryProcess.getKey().getLocalAddress()); } } + + public Auditor getAuditor() throws Exception { + for (AutoRecoveryMain p : autoRecoveryProcesses.values()) { + Auditor a = p.getAuditor(); + if (a != null) { + return a; + } + } + throw new Exception("No auditor found"); + } }