Author: ivank
Date: Thu Apr 24 08:43:56 2014
New Revision: 1589636
URL: http://svn.apache.org/r1589636
Log:
BOOKKEEPER-742: Fix for empty ledgers losing quorum. (ivank)
Modified:
zookeeper/bookkeeper/trunk/CHANGES.txt
zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java
zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java
zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java
zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
Modified: zookeeper/bookkeeper/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/CHANGES.txt?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/CHANGES.txt (original)
+++ zookeeper/bookkeeper/trunk/CHANGES.txt Thu Apr 24 08:43:56 2014
@@ -178,6 +178,8 @@ Trunk (unreleased changes)
BOOKKEEPER-432: Improve performance of entry log range read per ledger entries (yixue,
sijie via ivank)
+ BOOKKEEPER-742: Fix for empty ledgers losing quorum. (ivank)
+
hedwig-server:
BOOKKEEPER-601: readahead cache size isn't updated correctly (sijie via fpj)
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeperAdmin.java
Thu Apr 24 08:43:56 2014
@@ -879,4 +879,11 @@ public class BookKeeperAdmin {
}
};
}
+
+ /**
+ * @return the metadata for the passed ledger handle
+ */
+ public LedgerMetadata getLedgerMetadata(LedgerHandle lh) {
+ return lh.getLedgerMetadata();
+ }
}
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
Thu Apr 24 08:43:56 2014
@@ -43,6 +43,8 @@ import org.apache.bookkeeper.util.ZkUtil
import org.apache.bookkeeper.zookeeper.ZooKeeperWatcherBase;
import org.apache.commons.collections.CollectionUtils;
import org.apache.zookeeper.AsyncCallback;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.util.concurrent.SettableFuture;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
import org.slf4j.Logger;
@@ -54,6 +56,7 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.concurrent.Future;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
@@ -137,48 +140,49 @@ public class Auditor implements BookiesL
}
}
- private synchronized void submitAuditTask() {
- synchronized (this) {
- if (executor.isShutdown()) {
- return;
- }
- executor.submit(new Runnable() {
- @SuppressWarnings("unchecked")
- public void run() {
- try {
- waitIfLedgerReplicationDisabled();
+ @VisibleForTesting
+ synchronized Future<?> submitAuditTask() {
+ if (executor.isShutdown()) {
+ SettableFuture<Void> f = SettableFuture.<Void>create();
+ f.setException(new BKAuditException("Auditor shutting down"));
+ return f;
+ }
+ return executor.submit(new Runnable() {
+ @SuppressWarnings("unchecked")
+ public void run() {
+ try {
+ waitIfLedgerReplicationDisabled();
- List<String> availableBookies = getAvailableBookies();
+ List<String> availableBookies = getAvailableBookies();
- // casting to String, as knownBookies and availableBookies
- // contains only String values
- // find new bookies(if any) and update the known bookie list
- Collection<String> newBookies = CollectionUtils.subtract(
- availableBookies, knownBookies);
- knownBookies.addAll(newBookies);
-
- // find lost bookies(if any)
- Collection<String> lostBookies = CollectionUtils.subtract(
- knownBookies, availableBookies);
-
- if (lostBookies.size() > 0) {
- knownBookies.removeAll(lostBookies);
- Map<String, Set<Long>> ledgerDetails = generateBookie2LedgersIndex();
- handleLostBookies(lostBookies, ledgerDetails);
- }
- } catch (BKException bke) {
- LOG.error("Exception getting bookie list", bke);
- } catch (InterruptedException ie) {
- Thread.currentThread().interrupt();
- LOG.error("Interrupted while watching available bookies ", ie);
- } catch (BKAuditException bke) {
- LOG.error("Exception while watching available bookies", bke);
- } catch (UnavailableException ue) {
- LOG.error("Exception while watching available bookies", ue);
+ // casting to String, as knownBookies and availableBookies
+ // contains only String values
+ // find new bookies(if any) and update the known bookie list
+ Collection<String> newBookies = CollectionUtils.subtract(
+ availableBookies, knownBookies);
+ knownBookies.addAll(newBookies);
+
+ // find lost bookies(if any)
+ Collection<String> lostBookies = CollectionUtils.subtract(
+ knownBookies, availableBookies);
+
+ if (lostBookies.size() > 0) {
+ knownBookies.removeAll(lostBookies);
+ Map<String, Set<Long>> ledgerDetails = generateBookie2LedgersIndex();
+ handleLostBookies(lostBookies, ledgerDetails);
}
+ } catch (BKException bke) {
+ LOG.error("Exception getting bookie list", bke);
+ } catch (InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ LOG.error("Interrupted while watching available bookies ", ie);
+ } catch (BKAuditException bke) {
+ LOG.error("Exception while watching available bookies", bke);
+ } catch (UnavailableException ue) {
+ LOG.error("Exception while watching available bookies", ue);
}
- });
- }
+ }
+ });
}
public void start() {
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java
Thu Apr 24 08:43:56 2014
@@ -269,6 +269,11 @@ public class AuditorElector {
executor.submit(r);
}
+ @VisibleForTesting
+ Auditor getAuditor() {
+ return auditor;
+ }
+
/**
* Shutting down AuditorElector
*/
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
Thu Apr 24 08:43:56 2014
@@ -24,6 +24,8 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
+import com.google.common.annotations.VisibleForTesting;
+
import org.apache.bookkeeper.bookie.Bookie;
import org.apache.bookkeeper.bookie.BookieCriticalThread;
import org.apache.bookkeeper.bookie.ExitCode;
@@ -144,6 +146,11 @@ public class AutoRecoveryMain {
return exitCode;
}
+ @VisibleForTesting
+ public Auditor getAuditor() {
+ return auditorElector.getAuditor();
+ }
+
/** Is auto-recovery service running? */
public boolean isAutoRecoveryRunning() {
return running;
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationException.java
Thu Apr 24 08:43:56 2014
@@ -70,5 +70,9 @@ public abstract class ReplicationExcepti
BKAuditException(String message, Throwable cause) {
super(message, cause);
}
+
+ BKAuditException(String message) {
+ super(message);
+ }
}
}
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
Thu Apr 24 08:43:56 2014
@@ -24,6 +24,9 @@ import java.util.List;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
+import java.util.SortedMap;
+import java.util.ArrayList;
+import java.util.Collection;
import java.util.concurrent.CountDownLatch;
import org.apache.bookkeeper.bookie.BookieThread;
@@ -36,6 +39,7 @@ import org.apache.bookkeeper.client.Book
import org.apache.bookkeeper.client.LedgerChecker;
import org.apache.bookkeeper.client.LedgerFragment;
import org.apache.bookkeeper.client.LedgerHandle;
+import org.apache.bookkeeper.client.LedgerMetadata;
import org.apache.bookkeeper.conf.ClientConfiguration;
import org.apache.bookkeeper.conf.ServerConfiguration;
import org.apache.bookkeeper.meta.LedgerManagerFactory;
@@ -197,7 +201,7 @@ public class ReplicationWorker implement
}
}
- if (foundOpenFragments) {
+ if (foundOpenFragments || isLastSegmentOpenAndMissingBookies(lh)) {
deferLedgerLockRelease(ledgerIdToReplicate);
return;
}
@@ -215,6 +219,45 @@ public class ReplicationWorker implement
}
}
+ /**
+ * When checking the fragments of a ledger, there is a corner case
+ * where if the last segment/ensemble is open, but nothing has been written to
+ * some of the quorums in the ensemble, bookies can fail without any action being
+ * taken. This is fine, until enough bookies fail to cause a quorum to become
+ * unavailable, by which time the ledger is unrecoverable.
+ *
+ * For example, if in a E3Q2, only 1 entry is written and the last bookie
+ * in the ensemble fails, nothing has been written to it, so nothing needs to be
+ * recovered. But if the second to last bookie fails, we've now lost quorum for
+ * the second entry, so it's impossible to see if the second has been written or
+ * not.
+ *
+ * To avoid this situation, we need to check if bookies in the final open ensemble
+ * are unavailable, and take action if so. The action to take is to close the ledger,
+ * after a grace period as the writting client may replace the faulty bookie on its
+ * own.
+ *
+ * Missing bookies in closed ledgers are fine, as we know the last confirmed add, so
+ * we can tell which entries are supposed to exist and rereplicate them if necessary.
+ */
+ private boolean isLastSegmentOpenAndMissingBookies(LedgerHandle lh) throws BKException
{
+ LedgerMetadata md = admin.getLedgerMetadata(lh);
+ if (md.isClosed()) {
+ return false;
+ }
+
+ SortedMap<Long, ArrayList<BookieSocketAddress>> ensembles
+ = admin.getLedgerMetadata(lh).getEnsembles();
+ ArrayList<BookieSocketAddress> finalEnsemble = ensembles.get(ensembles.lastKey());
+ Collection<BookieSocketAddress> available = admin.getAvailableBookies();
+ for (BookieSocketAddress b : finalEnsemble) {
+ if (!available.contains(b)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
/** Gets the under replicated fragments */
private Set<LedgerFragment> getUnderreplicatedFragments(LedgerHandle lh)
throws InterruptedException {
@@ -237,6 +280,10 @@ public class ReplicationWorker implement
LedgerHandle lh = null;
try {
lh = admin.openLedgerNoRecovery(ledgerId);
+ if (isLastSegmentOpenAndMissingBookies(lh)) {
+ lh = admin.openLedger(ledgerId);
+ }
+
Set<LedgerFragment> fragments = getUnderreplicatedFragments(lh);
for (LedgerFragment fragment : fragments) {
if (!fragment.isClosed()) {
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
Thu Apr 24 08:43:56 2014
@@ -24,6 +24,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
import org.apache.bookkeeper.client.BKException;
import org.apache.bookkeeper.client.BookKeeper.DigestType;
@@ -37,7 +38,7 @@ import org.apache.bookkeeper.net.BookieS
import org.apache.bookkeeper.proto.BookieServer;
import org.apache.bookkeeper.replication.ReplicationException.CompatibilityException;
import org.apache.bookkeeper.replication.ReplicationException.UnavailableException;
-import org.apache.bookkeeper.test.MultiLedgerManagerTestCase;
+import org.apache.bookkeeper.test.BookKeeperClusterTestCase;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
@@ -52,8 +53,7 @@ import org.slf4j.LoggerFactory;
* Auditor-rereplication process: Auditor will publish the bookie failures,
* consequently ReplicationWorker will get the notifications and act on it.
*/
-public class BookieAutoRecoveryTest extends
- MultiLedgerManagerTestCase {
+public class BookieAutoRecoveryTest extends BookKeeperClusterTestCase {
private static final Logger LOG = LoggerFactory
.getLogger(BookieAutoRecoveryTest.class);
private static final byte[] PASSWD = "admin".getBytes();
@@ -68,15 +68,15 @@ public class BookieAutoRecoveryTest exte
private final String UNDERREPLICATED_PATH = baseClientConf
.getZkLedgersRootPath() + "/underreplication/ledgers";
- public BookieAutoRecoveryTest(String ledgerManagerFactory) throws IOException, KeeperException,
+ public BookieAutoRecoveryTest() throws IOException, KeeperException,
InterruptedException, UnavailableException, CompatibilityException {
super(3);
- LOG.info("Running test case using ledger manager : "
- + ledgerManagerFactory);
- // set ledger manager name
- baseConf.setLedgerManagerFactoryClassName(ledgerManagerFactory);
+
+ baseConf.setLedgerManagerFactoryClassName(
+ "org.apache.bookkeeper.meta.HierarchicalLedgerManagerFactory");
baseConf.setOpenLedgerRereplicationGracePeriod(openLedgerRereplicationGracePeriod);
- baseClientConf.setLedgerManagerFactoryClassName(ledgerManagerFactory);
+ baseClientConf.setLedgerManagerFactoryClassName(
+ "org.apache.bookkeeper.meta.HierarchicalLedgerManagerFactory");
this.digestType = DigestType.MAC;
setAutoRecoveryEnabled(true);
}
@@ -342,6 +342,51 @@ public class BookieAutoRecoveryTest exte
}
}
+ /**
+ * Test that if a empty ledger loses the bookie not in the quorum for entry 0, it will
+ * still be openable when it loses enough bookies to lose a whole quorum.
+ */
+ @Test(timeout=10000)
+ public void testEmptyLedgerLosesQuorumEventually() throws Exception {
+ LedgerHandle lh = bkc.createLedger(3, 2, 2, DigestType.CRC32, PASSWD);
+ CountDownLatch latch = new CountDownLatch(1);
+ String urZNode = getUrLedgerZNode(lh);
+ watchUrLedgerNode(urZNode, latch);
+
+ BookieSocketAddress replicaToKill = LedgerHandleAdapter
+ .getLedgerMetadata(lh).getEnsembles().get(0L).get(2);
+ LOG.info("Killing last bookie, {}, in ensemble {}", replicaToKill,
+ LedgerHandleAdapter.getLedgerMetadata(lh).getEnsembles().get(0L));
+ killBookie(replicaToKill);
+
+ getAuditor().submitAuditTask().get(); // ensure auditor runs
+
+ assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS));
+ latch = new CountDownLatch(1);
+ Stat s = watchUrLedgerNode(urZNode, latch); // should be marked as replicated
+ if (s != null) {
+ assertTrue("Should be marked as replicated", latch.await(10, TimeUnit.SECONDS));
+ }
+
+ replicaToKill = LedgerHandleAdapter
+ .getLedgerMetadata(lh).getEnsembles().get(0L).get(1);
+ LOG.info("Killing second bookie, {}, in ensemble {}", replicaToKill,
+ LedgerHandleAdapter.getLedgerMetadata(lh).getEnsembles().get(0L));
+ killBookie(replicaToKill);
+
+ getAuditor().submitAuditTask().get(); // ensure auditor runs
+
+ assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS));
+ latch = new CountDownLatch(1);
+ s = watchUrLedgerNode(urZNode, latch); // should be marked as replicated
+ if (s != null) {
+ assertTrue("Should be marked as replicated", latch.await(5, TimeUnit.SECONDS));
+ }
+
+ // should be able to open ledger without issue
+ bkc.openLedger(lh.getId(), DigestType.CRC32, PASSWD);
+ }
+
private int getReplicaIndexInLedger(LedgerHandle lh,
BookieSocketAddress replicaToKill) {
SortedMap<Long, ArrayList<BookieSocketAddress>> ensembles = LedgerHandleAdapter
Modified: zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
URL: http://svn.apache.org/viewvc/zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java?rev=1589636&r1=1589635&r2=1589636&view=diff
==============================================================================
--- zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
(original)
+++ zookeeper/bookkeeper/trunk/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
Thu Apr 24 08:43:56 2014
@@ -44,6 +44,7 @@ import org.apache.bookkeeper.metastore.I
import org.apache.bookkeeper.net.BookieSocketAddress;
import org.apache.bookkeeper.proto.BookieServer;
import org.apache.bookkeeper.replication.AutoRecoveryMain;
+import org.apache.bookkeeper.replication.Auditor;
import org.apache.bookkeeper.replication.ReplicationException.CompatibilityException;
import org.apache.bookkeeper.replication.ReplicationException.UnavailableException;
import org.apache.commons.io.FileUtils;
@@ -528,4 +529,14 @@ public abstract class BookKeeperClusterT
+ autoRecoveryProcess.getKey().getLocalAddress());
}
}
+
+ public Auditor getAuditor() throws Exception {
+ for (AutoRecoveryMain p : autoRecoveryProcesses.values()) {
+ Auditor a = p.getAuditor();
+ if (a != null) {
+ return a;
+ }
+ }
+ throw new Exception("No auditor found");
+ }
}
|