Return-Path: X-Original-To: apmail-hadoop-hdfs-commits-archive@minotaur.apache.org Delivered-To: apmail-hadoop-hdfs-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 5471DFC42 for ; Wed, 27 Mar 2013 18:43:06 +0000 (UTC) Received: (qmail 61785 invoked by uid 500); 27 Mar 2013 18:43:06 -0000 Delivered-To: apmail-hadoop-hdfs-commits-archive@hadoop.apache.org Received: (qmail 61745 invoked by uid 500); 27 Mar 2013 18:43:06 -0000 Mailing-List: contact hdfs-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: hdfs-dev@hadoop.apache.org Delivered-To: mailing list hdfs-commits@hadoop.apache.org Received: (qmail 61735 invoked by uid 99); 27 Mar 2013 18:43:06 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 27 Mar 2013 18:43:06 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 27 Mar 2013 18:43:04 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 7165F2388847; Wed, 27 Mar 2013 18:42:44 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1461777 - in /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs: ./ src/main/java/org/apache/hadoop/hdfs/qjournal/client/ src/main/java/org/apache/hadoop/hdfs/qjournal/server/ Date: Wed, 27 Mar 2013 18:42:44 -0000 To: hdfs-commits@hadoop.apache.org From: todd@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20130327184244.7165F2388847@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: todd Date: Wed Mar 27 18:42:43 2013 New Revision: 1461777 URL: http://svn.apache.org/r1461777 Log: HDFS-4621. Additional logging to help diagnose slow QJM syncs. Contributed by Todd Lipcon. Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1461777&r1=1461776&r2=1461777&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original) +++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Wed Mar 27 18:42:43 2013 @@ -371,6 +371,8 @@ Release 2.0.5-beta - UNRELEASED HDFS-4635. Move BlockManager#computeCapacity to LightWeightGSet. (suresh) + HDFS-4621. Additional logging to help diagnose slow QJM syncs. (todd) + OPTIMIZATIONS BUG FIXES Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java?rev=1461777&r1=1461776&r2=1461777&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java (original) +++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java Wed Mar 27 18:42:43 2013 @@ -133,6 +133,8 @@ public class IPCLoggerChannel implements private Stopwatch lastHeartbeatStopwatch = new Stopwatch(); private static final long HEARTBEAT_INTERVAL_MILLIS = 1000; + + private static final long WARN_JOURNAL_MILLIS_THRESHOLD = 1000; static final Factory FACTORY = new AsyncLogger.Factory() { @Override @@ -371,6 +373,12 @@ public class IPCLoggerChannel implements now - submitNanos, TimeUnit.NANOSECONDS); metrics.addWriteEndToEndLatency(endToEndTime); metrics.addWriteRpcLatency(rpcTime); + if (rpcTime / 1000 > WARN_JOURNAL_MILLIS_THRESHOLD) { + QuorumJournalManager.LOG.warn( + "Took " + (rpcTime / 1000) + "ms to send a batch of " + + numTxns + " edits (" + data.length + " bytes) to " + + "remote journal " + IPCLoggerChannel.this); + } } synchronized (IPCLoggerChannel.this) { highestAckedTxId = firstTxnId + numTxns - 1; Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java?rev=1461777&r1=1461776&r2=1461777&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java (original) +++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java Wed Mar 27 18:42:43 2013 @@ -24,6 +24,7 @@ import java.util.concurrent.TimeoutExcep import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.util.Time; +import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import com.google.common.util.concurrent.FutureCallback; @@ -120,6 +121,15 @@ class QuorumCall { String msg = String.format( "Waited %s ms (timeout=%s ms) for a response for %s", waited, millis, operationName); + if (!successes.isEmpty()) { + msg += ". Succeeded so far: [" + Joiner.on(",").join(successes.keySet()) + "]"; + } + if (!exceptions.isEmpty()) { + msg += ". Exceptions so far: [" + getExceptionMapString() + "]"; + } + if (successes.isEmpty() && exceptions.isEmpty()) { + msg += ". No responses yet."; + } if (waited > millis * WAIT_PROGRESS_WARN_THRESHOLD) { QuorumJournalManager.LOG.warn(msg); } else { @@ -227,4 +237,22 @@ class QuorumCall { } return sb.toString(); } + + /** + * Return a string suitable for displaying to the user, containing + * any exceptions that have been received so far. + */ + private String getExceptionMapString() { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Map.Entry e : exceptions.entrySet()) { + if (!first) { + sb.append(", "); + } + first = false; + sb.append(e.getKey()).append(": ") + .append(e.getValue().getLocalizedMessage()); + } + return sb.toString(); + } } Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java?rev=1461777&r1=1461776&r2=1461777&view=diff ============================================================================== --- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java (original) +++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java Wed Mar 27 18:42:43 2013 @@ -128,6 +128,10 @@ class Journal implements Closeable { private final JournalMetrics metrics; + /** + * Time threshold for sync calls, beyond which a warning should be logged to the console. + */ + private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000; Journal(File logDir, String journalId, StorageErrorReporter errorReporter) throws IOException { @@ -370,6 +374,10 @@ class Journal implements Closeable { sw.stop(); metrics.addSync(sw.elapsedTime(TimeUnit.MICROSECONDS)); + if (sw.elapsedTime(TimeUnit.MILLISECONDS) > WARN_SYNC_MILLIS_THRESHOLD) { + LOG.warn("Sync of transaction range " + firstTxnId + "-" + lastTxnId + + " took " + sw.elapsedTime(TimeUnit.MILLISECONDS) + "ms"); + } if (isLagging) { // This batch of edits has already been committed on a quorum of other