From: kkaranasos@apache.org
To: common-commits@hadoop.apache.org
Date: Tue, 28 Nov 2017 21:47:48 -0000
Subject: [30/50] [abbrv] hadoop git commit: HDFS-12754. Lease renewal can hit a deadlock. Contributed by Kuhu Shukla.

HDFS-12754. Lease renewal can hit a deadlock. Contributed by Kuhu Shukla.
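The deadlock fixed here is a lock-order inversion, visible in the hunks below: on a renewal timeout the LeaseRenewer thread used to hold its own monitor while calling DFSClient#closeAllFilesBeingWritten, which needs each DFSOutputStream's lock, while a thread closing a stream already holds that stream's lock and calls back into the renewer (closeFile), which needs the renewer's monitor. The patch has the renewer snapshot and clear its client list under its lock and issue the close calls only after releasing it, and moves the filesBeingWritten bookkeeping into DFSClient. A minimal sketch of that shape and of the copy-then-act fix pattern follows; the class and member names are illustrative only, not the actual HDFS code.

// Minimal sketch of the lock-order inversion and the fix pattern applied by
// this patch. Names (RenewerSketch, AbortableStream, renewerLock) are
// illustrative, not the real HDFS classes.
import java.util.ArrayList;
import java.util.List;

class RenewerSketch {
  interface AbortableStream {
    void abort();   // assumed to lock the stream itself, like DFSOutputStream
  }

  private final Object renewerLock = new Object();                 // stands in for the LeaseRenewer monitor
  private final List<AbortableStream> streams = new ArrayList<>(); // streams being written

  // Deadlock-prone shape (old behavior): abort() is called while renewerLock
  // is held; a closer that already holds a stream's lock and then waits on
  // renewerLock deadlocks with this thread.
  void abortAllHoldingRenewerLock() {
    synchronized (renewerLock) {
      for (AbortableStream s : streams) {
        s.abort();
      }
      streams.clear();
    }
  }

  // Fix pattern (new behavior): copy and clear under renewerLock, then abort
  // outside it, so the two locks are never held at the same time here.
  void abortAllOutsideRenewerLock() {
    List<AbortableStream> copy;
    synchronized (renewerLock) {
      copy = new ArrayList<>(streams);
      streams.clear();
    }
    for (AbortableStream s : copy) {
      s.abort();
    }
  }
}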
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/738d1a20
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/738d1a20
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/738d1a20

Branch: refs/heads/YARN-6592
Commit: 738d1a206aba05f0b4be7d633b17db7fcd1c74bc
Parents: 785732c
Author: Kihwal Lee
Authored: Wed Nov 22 13:01:07 2017 -0600
Committer: Kihwal Lee
Committed: Wed Nov 22 13:01:55 2017 -0600

----------------------------------------------------------------------
 .../java/org/apache/hadoop/hdfs/DFSClient.java  | 15 +++-
 .../hadoop/hdfs/DFSClientFaultInjector.java     |  2 +
 .../hadoop/hdfs/client/impl/LeaseRenewer.java   | 55 +++++---------
 .../hdfs/client/impl/TestLeaseRenewer.java      | 23 ++----
 .../hadoop/hdfs/TestDFSClientRetries.java       | 80 ++++++++++++++++++++
 .../hadoop/hdfs/TestDistributedFileSystem.java  |  5 +-
 6 files changed, 122 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/738d1a20/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
index 467d6be..5652321 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
@@ -483,12 +483,21 @@ public class DFSClient implements java.io.Closeable, RemotePeerFactory,
   /** Get a lease and start automatic renewal */
   private void beginFileLease(final long inodeId, final DFSOutputStream out)
       throws IOException {
-    getLeaseRenewer().put(inodeId, out, this);
+    synchronized (filesBeingWritten) {
+      putFileBeingWritten(inodeId, out);
+      getLeaseRenewer().put(this);
+    }
   }
 
   /** Stop renewal of lease for the file. */
   void endFileLease(final long inodeId) {
-    getLeaseRenewer().closeFile(inodeId, this);
+    synchronized (filesBeingWritten) {
+      removeFileBeingWritten(inodeId);
+      // remove client from renewer if no files are open
+      if (filesBeingWritten.isEmpty()) {
+        getLeaseRenewer().closeClient(this);
+      }
+    }
   }
 
@@ -614,9 +623,9 @@ public class DFSClient implements java.io.Closeable, RemotePeerFactory,
   @Override
   public synchronized void close() throws IOException {
     if(clientRunning) {
+      // lease renewal stops when all files are closed
       closeAllFilesBeingWritten(false);
       clientRunning = false;
-      getLeaseRenewer().closeClient(this);
       // close connections to the namenode
       closeConnectionToNamenode();
     }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/738d1a20/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClientFaultInjector.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClientFaultInjector.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClientFaultInjector.java
index b58cf16..d36c058 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClientFaultInjector.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClientFaultInjector.java
@@ -63,4 +63,6 @@ public class DFSClientFaultInjector {
   }
 
   public void sleepBeforeHedgedGet() {}
+
+  public void delayWhenRenewLeaseTimeout() {}
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/738d1a20/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/LeaseRenewer.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/LeaseRenewer.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/LeaseRenewer.java
index 6faf133..e33d024 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/LeaseRenewer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/impl/LeaseRenewer.java
@@ -30,7 +30,7 @@ import java.util.Map;
 import org.apache.hadoop.HadoopIllegalArgumentException;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.hdfs.DFSClient;
-import org.apache.hadoop.hdfs.DFSOutputStream;
+import org.apache.hadoop.hdfs.DFSClientFaultInjector;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.util.Daemon;
@@ -76,7 +76,7 @@ import org.slf4j.LoggerFactory;
 public class LeaseRenewer {
   static final Logger LOG = LoggerFactory.getLogger(LeaseRenewer.class);
 
-  static final long LEASE_RENEWER_GRACE_DEFAULT = 60*1000L;
+  private static long leaseRenewerGraceDefault = 60*1000L;
   static final long LEASE_RENEWER_SLEEP_DEFAULT = 1000L;
 
   /** Get a {@link LeaseRenewer} instance */
@@ -156,9 +156,7 @@ public class LeaseRenewer {
       final LeaseRenewer stored = renewers.get(r.factorykey);
       //Since a renewer may expire, the stored renewer can be different.
       if (r == stored) {
-        if (!r.clientsRunning()) {
-          renewers.remove(r.factorykey);
-        }
+        renewers.remove(r.factorykey);
       }
     }
   }
@@ -201,7 +199,7 @@
   private LeaseRenewer(Factory.Key factorykey) {
     this.factorykey = factorykey;
-    unsyncSetGraceSleepPeriod(LEASE_RENEWER_GRACE_DEFAULT);
+    unsyncSetGraceSleepPeriod(leaseRenewerGraceDefault);
 
     if (LOG.isTraceEnabled()) {
       instantiationTrace = StringUtils.stringifyException(
@@ -293,8 +291,7 @@
         && Time.monotonicNow() - emptyTime > gracePeriod;
   }
 
-  public synchronized void put(final long inodeId, final DFSOutputStream out,
-      final DFSClient dfsc) {
+  public synchronized void put(final DFSClient dfsc) {
     if (dfsc.isClientRunning()) {
       if (!isRunning() || isRenewerExpired()) {
         //start a new deamon with a new id.
@@ -328,7 +325,6 @@
       });
       daemon.start();
     }
-    dfsc.putFileBeingWritten(inodeId, out);
     emptyTime = Long.MAX_VALUE;
   }
 }
@@ -338,28 +334,6 @@
     emptyTime = time;
   }
 
-  /** Close a file. */
-  public void closeFile(final long inodeId, final DFSClient dfsc) {
-    dfsc.removeFileBeingWritten(inodeId);
-
-    synchronized(this) {
-      if (dfsc.isFilesBeingWrittenEmpty()) {
-        dfsclients.remove(dfsc);
-      }
-      //update emptyTime if necessary
-      if (emptyTime == Long.MAX_VALUE) {
-        for(DFSClient c : dfsclients) {
-          if (!c.isFilesBeingWrittenEmpty()) {
-            //found a non-empty file-being-written map
-            return;
-          }
-        }
-        //discover the first time that all file-being-written maps are empty.
-        emptyTime = Time.monotonicNow();
-      }
-    }
-  }
-
   /** Close the given client. */
   public synchronized void closeClient(final DFSClient dfsc) {
     dfsclients.remove(dfsc);
@@ -447,14 +421,17 @@
         } catch (SocketTimeoutException ie) {
           LOG.warn("Failed to renew lease for " + clientsString() + " for "
               + (elapsed/1000) + " seconds. Aborting ...", ie);
+          List<DFSClient> dfsclientsCopy;
           synchronized (this) {
-            while (!dfsclients.isEmpty()) {
-              DFSClient dfsClient = dfsclients.get(0);
-              dfsClient.closeAllFilesBeingWritten(true);
-              closeClient(dfsClient);
-            }
+            DFSClientFaultInjector.get().delayWhenRenewLeaseTimeout();
+            dfsclientsCopy = new ArrayList<>(dfsclients);
+            dfsclients.clear();
             //Expire the current LeaseRenewer thread.
             emptyTime = 0;
+            Factory.INSTANCE.remove(LeaseRenewer.this);
+          }
+          for (DFSClient dfsClient : dfsclientsCopy) {
+            dfsClient.closeAllFilesBeingWritten(true);
           }
           break;
         } catch (IOException ie) {
@@ -511,4 +488,10 @@
       return b.append("]").toString();
     }
   }
+
+  @VisibleForTesting
+  public static void setLeaseRenewerGraceDefault(
+      long leaseRenewerGraceDefault) {
+    LeaseRenewer.leaseRenewerGraceDefault = leaseRenewerGraceDefault;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/738d1a20/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/client/impl/TestLeaseRenewer.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/client/impl/TestLeaseRenewer.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/client/impl/TestLeaseRenewer.java
index eb10e96..f73ea6d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/client/impl/TestLeaseRenewer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/client/impl/TestLeaseRenewer.java
@@ -109,7 +109,7 @@ public class TestLeaseRenewer {
     // Set up a file so that we start renewing our lease.
     DFSOutputStream mockStream = Mockito.mock(DFSOutputStream.class);
     long fileId = 123L;
-    renewer.put(fileId, mockStream, MOCK_DFSCLIENT);
+    renewer.put(MOCK_DFSCLIENT);
 
     // Wait for lease to get renewed
     long failTime = Time.monotonicNow() + 5000;
@@ -121,7 +121,7 @@ public class TestLeaseRenewer {
       Assert.fail("Did not renew lease at all!");
     }
 
-    renewer.closeFile(fileId, MOCK_DFSCLIENT);
+    renewer.closeClient(MOCK_DFSCLIENT);
   }
 
   /**
@@ -136,11 +136,8 @@ public class TestLeaseRenewer {
     Mockito.doReturn(false).when(mockClient1).renewLease();
     assertSame(renewer, LeaseRenewer.getInstance(
         FAKE_AUTHORITY, FAKE_UGI_A, mockClient1));
-
-    // Set up a file so that we start renewing our lease.
-    DFSOutputStream mockStream1 = Mockito.mock(DFSOutputStream.class);
     long fileId = 456L;
-    renewer.put(fileId, mockStream1, mockClient1);
+    renewer.put(mockClient1);
 
     // Second DFSClient does renew lease
     final DFSClient mockClient2 = createMockClient();
@@ -148,9 +145,7 @@ public class TestLeaseRenewer {
     assertSame(renewer, LeaseRenewer.getInstance(
         FAKE_AUTHORITY, FAKE_UGI_A, mockClient2));
 
-    // Set up a file so that we start renewing our lease.
-    DFSOutputStream mockStream2 = Mockito.mock(DFSOutputStream.class);
-    renewer.put(fileId, mockStream2, mockClient2);
+    renewer.put(mockClient2);
 
     // Wait for lease to get renewed
 
@@ -171,19 +166,17 @@ public class TestLeaseRenewer {
       }
     }, 100, 10000);
 
-    renewer.closeFile(fileId, mockClient1);
-    renewer.closeFile(fileId, mockClient2);
+    renewer.closeClient(mockClient1);
+    renewer.closeClient(mockClient2);
   }
 
   @Test
   public void testThreadName() throws Exception {
-    DFSOutputStream mockStream = Mockito.mock(DFSOutputStream.class);
-    long fileId = 789L;
     Assert.assertFalse("Renewer not initially running",
         renewer.isRunning());
 
     // Pretend to open a file
-    renewer.put(fileId, mockStream, MOCK_DFSCLIENT);
+    renewer.put(MOCK_DFSCLIENT);
 
     Assert.assertTrue("Renewer should have started running",
         renewer.isRunning());
@@ -193,7 +186,7 @@ public class TestLeaseRenewer {
     Assert.assertEquals("LeaseRenewer:myuser@hdfs://nn1/", threadName);
 
     // Pretend to close the file
-    renewer.closeFile(fileId, MOCK_DFSCLIENT);
+    renewer.closeClient(MOCK_DFSCLIENT);
     renewer.setEmptyTime(Time.monotonicNow());
 
     // Should stop the renewer running within a few seconds

http://git-wip-us.apache.org/repos/asf/hadoop/blob/738d1a20/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientRetries.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientRetries.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientRetries.java
index 9674454..880e3df 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientRetries.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientRetries.java
@@ -47,6 +47,7 @@ import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.List;
 import java.util.Random;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.logging.Log;
@@ -1253,4 +1254,83 @@ public class TestDFSClientRetries {
       cluster.shutdown();
     }
   }
+
+  @Test(timeout=120000)
+  public void testLeaseRenewAndDFSOutputStreamDeadLock() throws Exception {
+    CountDownLatch testLatch = new CountDownLatch(1);
+    DFSClientFaultInjector.set(new DFSClientFaultInjector() {
+      public void delayWhenRenewLeaseTimeout() {
+        try {
+          testLatch.await();
+        } catch (InterruptedException e) {
+          e.printStackTrace();
+        }
+      }
+    });
+    String file1 = "/testFile1";
+    // Set short retry timeouts so this test runs faster
+    conf.setInt(DFS_CLIENT_SOCKET_TIMEOUT_KEY, 1000);
+    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
+    try {
+      cluster.waitActive();
+      final NamenodeProtocols spyNN = spy(cluster.getNameNodeRpc());
+
+      doAnswer(new SleepFixedTimeAnswer(1500, testLatch)).when(spyNN).complete(
+          anyString(), anyString(), any(ExtendedBlock.class), anyLong());
+      DFSClient client = new DFSClient(null, spyNN, conf, null);
+      // Get hold of the lease renewer instance used by the client
+      LeaseRenewer leaseRenewer = client.getLeaseRenewer();
+      leaseRenewer.setRenewalTime(100);
+      final OutputStream out1 = client.create(file1, false);
+
+      out1.write(new byte[256]);
+
+      Thread closeThread = new Thread(new Runnable() {
+        @Override public void run() {
+          try {
+            //1. trigger get LeaseRenewer lock
+            Mockito.doThrow(new SocketTimeoutException()).when(spyNN)
+                .renewLease(Mockito.anyString());
+          } catch (IOException e) {
+            e.printStackTrace();
+          }
+        }
+      });
+      closeThread.start();
+
+      //2. trigger get DFSOutputStream lock
+      out1.close();
+
+    } finally {
+      cluster.shutdown();
+    }
+  }
+
+  private static class SleepFixedTimeAnswer implements Answer<Object> {
+    private final int sleepTime;
+    private final CountDownLatch testLatch;
+
+    SleepFixedTimeAnswer(int sleepTime, CountDownLatch latch) {
+      this.sleepTime = sleepTime;
+      this.testLatch = latch;
+    }
+
+    @Override
+    public Object answer(InvocationOnMock invocation) throws Throwable {
+      boolean interrupted = false;
+      try {
+        Thread.sleep(sleepTime);
+      } catch (InterruptedException ie) {
+        interrupted = true;
+      }
+      try {
+        return invocation.callRealMethod();
+      } finally {
+        testLatch.countDown();
+        if (interrupted) {
+          Thread.currentThread().interrupt();
+        }
+      }
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/738d1a20/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDistributedFileSystem.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDistributedFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDistributedFileSystem.java
index 6389f0d..823c747 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDistributedFileSystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDistributedFileSystem.java
@@ -424,6 +424,7 @@ public class TestDistributedFileSystem {
     Configuration conf = getTestConfiguration();
     final long grace = 1000L;
     MiniDFSCluster cluster = null;
+    LeaseRenewer.setLeaseRenewerGraceDefault(grace);
 
     try {
       cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
@@ -436,10 +437,6 @@ public class TestDistributedFileSystem {
     {
       final DistributedFileSystem dfs = cluster.getFileSystem();
-      Method setMethod = dfs.dfs.getLeaseRenewer().getClass()
-          .getDeclaredMethod("setGraceSleepPeriod", long.class);
-      setMethod.setAccessible(true);
-      setMethod.invoke(dfs.dfs.getLeaseRenewer(), grace);
       Method checkMethod = dfs.dfs.getLeaseRenewer().getClass()
          .getDeclaredMethod("isRunning");
       checkMethod.setAccessible(true);