Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id B2B5E200B74 for ; Thu, 18 Aug 2016 05:15:29 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id B1647160AB5; Thu, 18 Aug 2016 03:15:29 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id CF838160A8C for ; Thu, 18 Aug 2016 05:15:28 +0200 (CEST) Received: (qmail 51145 invoked by uid 500); 18 Aug 2016 03:15:28 -0000 Mailing-List: contact commits-help@hbase.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@hbase.apache.org Delivered-To: mailing list commits@hbase.apache.org Received: (qmail 51136 invoked by uid 99); 18 Aug 2016 03:15:27 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 18 Aug 2016 03:15:27 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id D4C9FE01BD; Thu, 18 Aug 2016 03:15:27 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: liyu@apache.org To: commits@hbase.apache.org Message-Id: <314817e8f0e743f28b0e720100fba2e5@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: hbase git commit: HBASE-16429 FSHLog: deadlock if rollWriter called when ring buffer filled with appends Date: Thu, 18 Aug 2016 03:15:27 +0000 (UTC) archived-at: Thu, 18 Aug 2016 03:15:29 -0000 Repository: hbase Updated Branches: refs/heads/branch-1.3 4e78f7a99 -> e3ed8ecdb HBASE-16429 FSHLog: deadlock if rollWriter called when ring buffer filled with appends Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/e3ed8ecd Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/e3ed8ecd Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/e3ed8ecd Branch: refs/heads/branch-1.3 Commit: e3ed8ecdb3ce7ee727b525f88bc58650559ebc7e Parents: 4e78f7a Author: Yu Li Authored: Thu Aug 18 09:59:36 2016 +0800 Committer: Yu Li Committed: Thu Aug 18 10:04:44 2016 +0800 ---------------------------------------------------------------------- .../hadoop/hbase/regionserver/wal/FSHLog.java | 33 +++++++-- .../hadoop/hbase/regionserver/TestHRegion.java | 70 ++++++++++++++++++++ 2 files changed, 98 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/e3ed8ecd/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java index 070ba3b..097101b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java @@ -821,8 +821,20 @@ public class FSHLog implements WAL { // constructor BEFORE the ring buffer is set running so it is null on first time through // here; allow for that. SyncFuture syncFuture = null; - SafePointZigZagLatch zigzagLatch = (this.ringBufferEventHandler == null)? - null: this.ringBufferEventHandler.attainSafePoint(); + SafePointZigZagLatch zigzagLatch = null; + long sequence = -1L; + if (this.ringBufferEventHandler != null) { + // Get sequence first to avoid dead lock when ring buffer is full + // Considering below sequence + // 1. replaceWriter is called and zigzagLatch is initialized + // 2. ringBufferEventHandler#onEvent is called and arrives at #attainSafePoint(long) then wait + // on safePointReleasedLatch + // 3. Since ring buffer is full, if we get sequence when publish sync, the replaceWriter + // thread will wait for the ring buffer to be consumed, but the only consumer is waiting + // replaceWriter thread to release safePointReleasedLatch, which causes a deadlock + sequence = getSequenceOnRingBuffer(); + zigzagLatch = this.ringBufferEventHandler.attainSafePoint(); + } afterCreatingZigZagLatch(); TraceScope scope = Trace.startSpan("FSHFile.replaceWriter"); try { @@ -833,8 +845,11 @@ public class FSHLog implements WAL { // to come back. Cleanup this syncFuture down below after we are ready to run again. try { if (zigzagLatch != null) { + // use assert to make sure no change breaks the logic that + // sequence and zigzagLatch will be set together + assert sequence > 0L : "Failed to get sequence from ring buffer"; Trace.addTimelineAnnotation("awaiting safepoint"); - syncFuture = zigzagLatch.waitSafePoint(publishSyncOnRingBuffer()); + syncFuture = zigzagLatch.waitSafePoint(publishSyncOnRingBuffer(sequence)); } } catch (FailedSyncBeforeLogCloseException e) { // If unflushed/unsynced entries on close, it is reason to abort. @@ -1337,12 +1352,20 @@ public class FSHLog implements WAL { return logRollNeeded; } - private SyncFuture publishSyncOnRingBuffer() { - return publishSyncOnRingBuffer(null); + private SyncFuture publishSyncOnRingBuffer(long sequence) { + return publishSyncOnRingBuffer(sequence, null); + } + + private long getSequenceOnRingBuffer() { + return this.disruptor.getRingBuffer().next(); } private SyncFuture publishSyncOnRingBuffer(Span span) { long sequence = this.disruptor.getRingBuffer().next(); + return publishSyncOnRingBuffer(sequence, span); + } + + private SyncFuture publishSyncOnRingBuffer(long sequence, Span span) { SyncFuture syncFuture = getSyncFuture(sequence, span); try { RingBufferTruck truck = this.disruptor.getRingBuffer().get(sequence); http://git-wip-us.apache.org/repos/asf/hbase/blob/e3ed8ecd/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java index 4d9c6bd..708af58 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java @@ -6623,4 +6623,74 @@ public class TestHRegion { return initHRegion(tableName, callingMethod, HBaseConfiguration.create(), families); } + + /** + * HBASE-16429 Make sure no stuck if roll writer when ring buffer is filled with appends + * @throws IOException if IO error occurred during test + */ + @Test(timeout = 60000) + public void testWritesWhileRollWriter() throws IOException { + int testCount = 10; + int numRows = 1024; + int numFamilies = 2; + int numQualifiers = 2; + final byte[][] families = new byte[numFamilies][]; + for (int i = 0; i < numFamilies; i++) { + families[i] = Bytes.toBytes("family" + i); + } + final byte[][] qualifiers = new byte[numQualifiers][]; + for (int i = 0; i < numQualifiers; i++) { + qualifiers[i] = Bytes.toBytes("qual" + i); + } + + String method = "testWritesWhileRollWriter"; + CONF.setInt("hbase.regionserver.wal.disruptor.event.count", 2); + this.region = initHRegion(tableName, method, CONF, families); + try { + List threads = new ArrayList(); + for (int i = 0; i < numRows; i++) { + final int count = i; + Thread t = new Thread(new Runnable() { + + @Override + public void run() { + byte[] row = Bytes.toBytes("row" + count); + Put put = new Put(row); + put.setDurability(Durability.SYNC_WAL); + byte[] value = Bytes.toBytes(String.valueOf(count)); + for (byte[] family : families) { + for (byte[] qualifier : qualifiers) { + put.addColumn(family, qualifier, (long) count, value); + } + } + try { + region.put(put); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }); + threads.add(t); + } + for (Thread t : threads) { + t.start(); + } + + for (int i = 0; i < testCount; i++) { + region.getWAL().rollWriter(); + Thread.yield(); + } + } finally { + try { + HBaseTestingUtility.closeRegionAndWAL(this.region); + CONF.setInt("hbase.regionserver.wal.disruptor.event.count", 16 * 1024); + } catch (DroppedSnapshotException dse) { + // We could get this on way out because we interrupt the background flusher and it could + // fail anywhere causing a DSE over in the background flusher... only it is not properly + // dealt with so could still be memory hanging out when we get to here -- memory we can't + // flush because the accounting is 'off' since original DSE. + } + this.region = null; + } + } }