From: cnauroth@apache.org
To: common-commits@hadoop.apache.org
Date: Tue, 06 Sep 2016 16:44:04 -0000
Message-Id: <2c2becb37b6143daab1b88398c9ea2c6@git.apache.org>
In-Reply-To: <99b9cf187a7d42009393da29b21bbd46@git.apache.org>
References: <99b9cf187a7d42009393da29b21bbd46@git.apache.org>
Subject: [31/50] [abbrv] hadoop git commit: HDFS-10817. Add Logging for Long-held NN Read Locks. Contributed by Erik Krogen.

HDFS-10817. Add Logging for Long-held NN Read Locks. Contributed by Erik Krogen.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/6f4b0d33
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/6f4b0d33
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/6f4b0d33

Branch: refs/heads/HADOOP-13345
Commit: 6f4b0d33ca339e3724623a1d23c101f8cfd3cdd5
Parents: 85bab5f
Author: Zhe Zhang
Authored: Wed Aug 31 15:40:01 2016 -0700
Committer: Zhe Zhang
Committed: Wed Aug 31 15:40:09 2016 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hdfs/DFSConfigKeys.java   |   6 +-
 .../hdfs/server/namenode/FSNamesystem.java      |  32 +++++
 .../src/main/resources/hdfs-default.xml         |   9 ++
 .../hdfs/server/namenode/TestFSNamesystem.java  | 144 ++++++++++++++++++-
 4 files changed, 188 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/6f4b0d33/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index 2eff3b0..f2ab321 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -407,10 +407,14 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final long
       DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_DEFAULT = 25;
 
-  // Threshold for how long a write lock must be held for the event to be logged
+  // Threshold for how long namenode locks must be held for the
+  // event to be logged
   public static final String DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY =
       "dfs.namenode.write-lock-reporting-threshold-ms";
   public static final long DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_DEFAULT = 1000L;
+  public static final String DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY =
+      "dfs.namenode.read-lock-reporting-threshold-ms";
+  public static final long DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_DEFAULT = 5000L;
 
   public static final String DFS_UPGRADE_DOMAIN_FACTOR =
       "dfs.namenode.upgrade.domain.factor";
   public static final int DFS_UPGRADE_DOMAIN_FACTOR_DEFAULT = DFS_REPLICATION_DEFAULT;
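The new key/default pair mirrors the existing write-lock constants: read locks on the FSNamesystem that stay held past dfs.namenode.read-lock-reporting-threshold-ms (default 5000 ms) are logged when the lock is released. As a minimal, illustrative sketch that is not part of this patch, the threshold could also be tuned programmatically through the standard Configuration API; the class name below is made up for the example, and it assumes the patched DFSConfigKeys is on the classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;

public class ReadLockThresholdExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Hypothetical override: report read locks held longer than 500 ms
    // instead of the shipped 5000 ms default.
    conf.setLong(
        DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
        500L);

    // Read the effective value back, falling back to the default constant.
    long threshold = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
        DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_DEFAULT);
    System.out.println("read-lock reporting threshold = " + threshold + " ms");
  }
}

The same effect is normally achieved by setting dfs.namenode.read-lock-reporting-threshold-ms in hdfs-site.xml, which the hdfs-default.xml change further below documents.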
http://git-wip-us.apache.org/repos/asf/hadoop/blob/6f4b0d33/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index f4b742e..3b14eec 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -73,6 +73,8 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_I
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
@@ -824,6 +826,9 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
     this.writeLockReportingThreshold = conf.getLong(
         DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY,
         DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_DEFAULT);
+    this.readLockReportingThreshold = conf.getLong(
+        DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
+        DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_DEFAULT);
 
     // For testing purposes, allow the DT secret manager to be started regardless
     // of whether security is enabled.
@@ -1507,14 +1512,41 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
   private long writeLockReportingThreshold;
   /** Last time stamp for write lock. Keep the longest one for multi-entrance.*/
   private long writeLockHeldTimeStamp;
+  /** Threshold (ms) for long holding read lock report. */
+  private long readLockReportingThreshold;
+  /**
+   * Last time stamp for read lock. Keep the longest one for
+   * multi-entrance. This is ThreadLocal since there could be
+   * many read locks held simultaneously.
+   */
+  private static ThreadLocal<Long> readLockHeldTimeStamp =
+      new ThreadLocal<Long>() {
+        @Override
+        public Long initialValue() {
+          return Long.MAX_VALUE;
+        }
+      };
 
   @Override
   public void readLock() {
     this.fsLock.readLock().lock();
+    if (this.fsLock.getReadHoldCount() == 1) {
+      readLockHeldTimeStamp.set(monotonicNow());
+    }
   }
   @Override
   public void readUnlock() {
+    final boolean needReport = this.fsLock.getReadHoldCount() == 1;
+    final long readLockInterval = monotonicNow() - readLockHeldTimeStamp.get();
     this.fsLock.readLock().unlock();
+
+    if (needReport) {
+      readLockHeldTimeStamp.remove();
+      if (readLockInterval > this.readLockReportingThreshold) {
+        LOG.info("FSNamesystem read lock held for " + readLockInterval +
+            " ms via\n" + StringUtils.getStackTrace(Thread.currentThread()));
+      }
+    }
   }
   @Override
   public void writeLock() {
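In readLock()/readUnlock() above, only the outermost acquisition by a given thread (fsLock.getReadHoldCount() == 1) starts and stops the clock, and the start time lives in a ThreadLocal because many threads may hold the read lock at once; the message is logged only after the lock has been released, so the stack-trace formatting stays out of the critical section. The following self-contained sketch shows the same pattern against a plain java.util.concurrent.locks.ReentrantReadWriteLock. The class name, the hard-coded 100 ms threshold, and the use of System.nanoTime() in place of Hadoop's monotonicNow() are assumptions made for the example, not part of the patch.

import java.util.concurrent.locks.ReentrantReadWriteLock;

public class LongHeldReadLockDemo {
  private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true);
  private final long reportingThresholdMs = 100L; // stand-in for the new config key

  // One timestamp per thread, since many threads may hold the read lock at once.
  private static final ThreadLocal<Long> readLockHeldTimeStamp =
      new ThreadLocal<Long>() {
        @Override
        protected Long initialValue() {
          return Long.MAX_VALUE;
        }
      };

  public void readLock() {
    lock.readLock().lock();
    // Only the outermost acquisition by this thread starts the clock,
    // so re-entrant readLock() calls do not reset the timing.
    if (lock.getReadHoldCount() == 1) {
      readLockHeldTimeStamp.set(System.nanoTime() / 1_000_000L);
    }
  }

  public void readUnlock() {
    final boolean needReport = lock.getReadHoldCount() == 1;
    final long heldMs = System.nanoTime() / 1_000_000L - readLockHeldTimeStamp.get();
    lock.readLock().unlock();
    // Report only after releasing the lock, and only for the outermost unlock.
    if (needReport) {
      readLockHeldTimeStamp.remove();
      if (heldMs > reportingThresholdMs) {
        System.out.println("read lock held for " + heldMs + " ms by "
            + Thread.currentThread().getName());
      }
    }
  }

  public static void main(String[] args) throws InterruptedException {
    LongHeldReadLockDemo demo = new LongHeldReadLockDemo();
    demo.readLock();
    Thread.sleep(150); // exceed the threshold so the hold is reported
    demo.readUnlock();
  }
}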
http://git-wip-us.apache.org/repos/asf/hadoop/blob/6f4b0d33/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index b9a0812..0c2c743 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -2624,6 +2624,15 @@
 </property>
 
 <property>
+  <name>dfs.namenode.read-lock-reporting-threshold-ms</name>
+  <value>5000</value>
+  <description>When a read lock is held on the namenode for a long time,
+    this will be logged as the lock is released. This sets how long the
+    lock must be held for logging to occur.
+  </description>
+</property>
+
+<property>
   <name>dfs.namenode.startup.delay.block.deletion.sec</name>
   <value>0</value>
   <description>The delay in seconds at which we will pause the blocks deletion
http://git-wip-us.apache.org/repos/asf/hadoop/blob/6f4b0d33/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java
index df9001d..ccc8a6e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java
@@ -28,6 +28,7 @@ import java.io.File;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.URI;
+import java.util.ArrayList;
 import java.util.Collection;
 
 import com.google.common.base.Supplier;
@@ -58,6 +59,7 @@ import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeoutException;
+import java.util.regex.Pattern;
 
 public class TestFSNamesystem {
 
@@ -286,10 +288,11 @@ public class TestFSNamesystem {
   }
 
   /**
-   * Test when FSNamesystem lock is held for a long time, logger will report it.
+   * Test when FSNamesystem write lock is held for a long time,
+   * logger will report it.
    */
   @Test(timeout=45000)
-  public void testFSLockLongHoldingReport() throws Exception {
+  public void testFSWriteLockLongHoldingReport() throws Exception {
     final long writeLockReportingThreshold = 100L;
     Configuration conf = new Configuration();
     conf.setLong(DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY,
@@ -341,6 +344,143 @@ public class TestFSNamesystem {
     assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName()));
   }
 
+  /**
+   * Test when FSNamesystem read lock is held for a long time,
+   * logger will report it.
+   */
+  @Test(timeout=45000)
+  public void testFSReadLockLongHoldingReport() throws Exception {
+    final long readLockReportingThreshold = 100L;
+    final String readLockLogStmt = "FSNamesystem read lock held for ";
+    Configuration conf = new Configuration();
+    conf.setLong(
+        DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
+        readLockReportingThreshold);
+    FSImage fsImage = Mockito.mock(FSImage.class);
+    FSEditLog fsEditLog = Mockito.mock(FSEditLog.class);
+    Mockito.when(fsImage.getEditLog()).thenReturn(fsEditLog);
+    FSNamesystem fsn = new FSNamesystem(conf, fsImage);
+
+    LogCapturer logs = LogCapturer.captureLogs(FSNamesystem.LOG);
+    GenericTestUtils.setLogLevel(FSNamesystem.LOG, Level.INFO);
+
+    // Don't report if the read lock is held for a short time
+    fsn.readLock();
+    Thread.sleep(readLockReportingThreshold / 2);
+    fsn.readUnlock();
+    assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) &&
+        logs.getOutput().contains(readLockLogStmt));
+
+    // Report if the read lock is held for a long time
+    fsn.readLock();
+    Thread.sleep(readLockReportingThreshold + 10);
+    logs.clearOutput();
+    fsn.readUnlock();
+    assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName())
+        && logs.getOutput().contains(readLockLogStmt));
+
+    // Report if it's held for a long time when re-entering read lock
+    fsn.readLock();
+    Thread.sleep(readLockReportingThreshold / 2 + 1);
+    fsn.readLock();
+    Thread.sleep(readLockReportingThreshold / 2 + 1);
+    logs.clearOutput();
+    fsn.readUnlock();
+    assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) ||
+        logs.getOutput().contains(readLockLogStmt));
+    logs.clearOutput();
+    fsn.readUnlock();
+    assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName()) &&
+        logs.getOutput().contains(readLockLogStmt));
+
+    // Report if it's held for a long time while another thread also has the
+    // read lock. Let one thread hold the lock long enough to activate an
+    // alert, then have another thread grab the read lock to ensure that this
+    // doesn't reset the timing.
+    logs.clearOutput();
+    CountDownLatch barrier = new CountDownLatch(1);
+    CountDownLatch barrier2 = new CountDownLatch(1);
+    Thread t1 = new Thread() {
+      @Override
+      public void run() {
+        try {
+          fsn.readLock();
+          Thread.sleep(readLockReportingThreshold + 1);
+          barrier.countDown(); // Allow for t2 to acquire the read lock
+          barrier2.await(); // Wait until t2 has the read lock
+          fsn.readUnlock();
+        } catch (InterruptedException e) {
+          fail("Interrupted during testing");
+        }
+      }
+    };
+    Thread t2 = new Thread() {
+      @Override
+      public void run() {
+        try {
+          barrier.await(); // Wait until t1 finishes sleeping
+          fsn.readLock();
+          barrier2.countDown(); // Allow for t1 to unlock
+          fsn.readUnlock();
+        } catch (InterruptedException e) {
+          fail("Interrupted during testing");
+        }
+      }
+    };
+    t1.start();
+    t2.start();
+    t1.join();
+    t2.join();
+    Pattern t1Pattern = Pattern.compile(
+        String.format("\\Q%s\\E.+%s", t1.getName(), readLockLogStmt));
+    assertTrue(t1Pattern.matcher(logs.getOutput()).find());
+    Pattern t2Pattern = Pattern.compile(
+        String.format("\\Q%s\\E.+%s", t2.getName(), readLockLogStmt));
+    assertFalse(t2Pattern.matcher(logs.getOutput()).find());
+
+    // Spin up a bunch of threads all grabbing the lock at once; assign some
+    // to go over threshold and some under. Check that they all log correctly.
+    logs.clearOutput();
+    final int threadCount = 50;
+    List<Thread> threads = new ArrayList<>(threadCount);
+    for (int i = 0; i < threadCount; i++) {
+      threads.add(new Thread() {
+        @Override
+        public void run() {
+          try {
+            long sleepTime;
+            if (this.getName().hashCode() % 2 == 0) {
+              sleepTime = readLockReportingThreshold + 10;
+            } else {
+              sleepTime = readLockReportingThreshold / 2;
+            }
+            fsn.readLock();
+            Thread.sleep(sleepTime);
+            fsn.readUnlock();
+          } catch (InterruptedException e) {
+            fail("Interrupted during testing");
+          }
+        }
+      });
+    }
+    for (Thread t : threads) {
+      t.start();
+    }
+    for (Thread t : threads) {
+      t.join();
+    }
+    for (Thread t : threads) {
+      Pattern p = Pattern.compile(
+          String.format("\\Q%s\\E.+%s", t.getName(), readLockLogStmt));
+      boolean foundLog = p.matcher(logs.getOutput()).find();
+      if (t.getName().hashCode() % 2 == 0) {
+        assertTrue(foundLog);
+      } else {
+        assertFalse(foundLog);
+      }
+    }
+  }
+
   @Test
   public void testSafemodeReplicationConf() throws IOException {
     Configuration conf = new Configuration();

---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org