From: todd@apache.org
To: hdfs-commits@hadoop.apache.org
Reply-To: hdfs-dev@hadoop.apache.org
Subject: svn commit: r1292610 - in /hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs: ./ src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ src/main/java/org/apache/hadoop/hdfs/server/namenode/ src/test/java/org/apache/hadoop/hdfs/ s...
Date: Thu, 23 Feb 2012 01:23:18 -0000
Message-Id: <20120223012318.BD4D5238899C@eris.apache.org>

Author: todd
Date: Thu Feb 23 01:23:17 2012
New Revision: 1292610

URL: http://svn.apache.org/viewvc?rev=1292610&view=rev
Log:
HDFS-2985. Improve logging when replicas are marked as corrupt. Contributed by Todd Lipcon.
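In short, this commit threads a free-form reason string from every place that marks a replica corrupt down to the NameNode's state-change log, so the log finally says why a replica was condemned. A minimal sketch of the new call shape (not code from this commit; MarkCorruptSketch is a hypothetical wrapper, and blockManager, blk, and dn are assumed to be in scope the way they are at the real call site in NameNodeRpcServer below):

    import java.io.IOException;

    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
    import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
    import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;

    class MarkCorruptSketch {
      // Illustrates the new three-argument signature only; the reason
      // string is purely informational and ends up in the NameNode log.
      static void reportCorrupt(BlockManager blockManager, ExtendedBlock blk,
          DatanodeInfo dn) throws IOException {
        blockManager.findAndMarkBlockAsCorrupt(blk, dn,
            "client machine reported it");
      }
    }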
Modified:
    hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
    hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
    hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java
    hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
    hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java
    hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java
    hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java

Modified: hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1292610&r1=1292609&r2=1292610&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Thu Feb 23 01:23:17 2012
@@ -28,6 +28,8 @@ Release 0.23.2 - UNRELEASED
     HDFS-2907. Add a conf property dfs.datanode.fsdataset.factory to make
     FSDataset in Datanode pluggable.  (szetszwo)
 
+    HDFS-2985. Improve logging when replicas are marked as corrupt. (todd)
+
   OPTIMIZATIONS
 
   BUG FIXES

Modified: hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java?rev=1292610&r1=1292609&r2=1292610&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java Thu Feb 23 01:23:17 2012
@@ -805,9 +805,11 @@ public class BlockManager {
    * Mark the block belonging to datanode as corrupt
    * @param blk Block to be marked as corrupt
    * @param dn Datanode which holds the corrupt replica
+   * @param reason a textual reason why the block should be marked corrupt,
+   *        for logging purposes
    */
   public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk,
-      final DatanodeInfo dn) throws IOException {
+      final DatanodeInfo dn, String reason) throws IOException {
     namesystem.writeLock();
     try {
       final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock());
@@ -820,14 +822,15 @@ public class BlockManager {
             + blk + " not found.");
         return;
       }
-      markBlockAsCorrupt(storedBlock, dn);
+      markBlockAsCorrupt(storedBlock, dn, reason);
     } finally {
       namesystem.writeUnlock();
     }
   }
 
   private void markBlockAsCorrupt(BlockInfo storedBlock,
-                                  DatanodeInfo dn) throws IOException {
+                                  DatanodeInfo dn,
+                                  String reason) throws IOException {
     assert storedBlock != null : "storedBlock should not be null";
     DatanodeDescriptor node = getDatanodeManager().getDatanode(dn);
     if (node == null) {
@@ -851,7 +854,7 @@ public class BlockManager {
       node.addBlock(storedBlock);
 
     // Add this replica to corruptReplicas Map
-    corruptReplicas.addToCorruptReplicasMap(storedBlock, node);
+    corruptReplicas.addToCorruptReplicasMap(storedBlock, node, reason);
     if (countNodes(storedBlock).liveReplicas() >= inode.getReplication()) {
       // the block is over-replicated so invalidate the replicas immediately
       invalidateBlock(storedBlock, node);
@@ -1313,6 +1316,21 @@ public class BlockManager {
       this.reportedState = reportedState;
     }
   }
+
+  /**
+   * BlockToMarkCorrupt is used to build the "toCorrupt" list, which is a
+   * list of blocks that should be considered corrupt due to a block report.
+   */
+  private static class BlockToMarkCorrupt {
+    final BlockInfo blockInfo;
+    final String reason;
+
+    BlockToMarkCorrupt(BlockInfo blockInfo, String reason) {
+      super();
+      this.blockInfo = blockInfo;
+      this.reason = reason;
+    }
+  }
 
   /**
    * The given datanode is reporting all its blocks.
@@ -1367,7 +1385,7 @@ public class BlockManager {
     Collection<BlockInfo> toAdd = new LinkedList<BlockInfo>();
     Collection<Block> toRemove = new LinkedList<Block>();
     Collection<Block> toInvalidate = new LinkedList<Block>();
-    Collection<BlockInfo> toCorrupt = new LinkedList<BlockInfo>();
+    Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
     Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
     reportDiff(node, report, toAdd, toRemove, toInvalidate, toCorrupt, toUC);
@@ -1387,8 +1405,8 @@ public class BlockManager {
           + " does not belong to any file.");
       addToInvalidates(b, node);
     }
-    for (BlockInfo b : toCorrupt) {
-      markBlockAsCorrupt(b, node);
+    for (BlockToMarkCorrupt b : toCorrupt) {
+      markBlockAsCorrupt(b.blockInfo, node, b.reason);
     }
   }
@@ -1419,8 +1437,10 @@ public class BlockManager {
 
       // If block is corrupt, mark it and continue to next block.
       BlockUCState ucState = storedBlock.getBlockUCState();
-      if (isReplicaCorrupt(iblk, reportedState, storedBlock, ucState, node)) {
-        markBlockAsCorrupt(storedBlock, node);
+      BlockToMarkCorrupt c = checkReplicaCorrupt(
+          iblk, reportedState, storedBlock, ucState, node);
+      if (c != null) {
+        markBlockAsCorrupt(c.blockInfo, node, c.reason);
         continue;
       }
@@ -1442,7 +1462,7 @@ public class BlockManager {
       Collection<BlockInfo> toAdd,              // add to DatanodeDescriptor
       Collection<Block> toRemove,               // remove from DatanodeDescriptor
       Collection<Block> toInvalidate,           // should be removed from DN
-      Collection<BlockInfo> toCorrupt,          // add to corrupt replicas list
+      Collection<BlockToMarkCorrupt> toCorrupt, // add to corrupt replicas list
       Collection<StatefulBlockInfo> toUC) {     // add to under-construction list
     // place a delimiter in the list which separates blocks
     // that have been reported from those that have not
@@ -1505,7 +1525,7 @@ public class BlockManager {
       final Block block, final ReplicaState reportedState,
       final Collection<BlockInfo> toAdd,
       final Collection<Block> toInvalidate,
-      final Collection<BlockInfo> toCorrupt,
+      final Collection<BlockToMarkCorrupt> toCorrupt,
       final Collection<StatefulBlockInfo> toUC) {
 
     if(LOG.isDebugEnabled()) {
@@ -1536,8 +1556,10 @@ public class BlockManager {
       return storedBlock;
     }
 
-    if (isReplicaCorrupt(block, reportedState, storedBlock, ucState, dn)) {
-      toCorrupt.add(storedBlock);
+    BlockToMarkCorrupt c = checkReplicaCorrupt(
+        block, reportedState, storedBlock, ucState, dn);
+    if (c != null) {
+      toCorrupt.add(c);
       return storedBlock;
     }
@@ -1561,8 +1583,11 @@ public class BlockManager {
    * as switch statements, on the theory that it is easier to understand
    * the combinatorics of reportedState and ucState that way. It should be
    * at least as efficient as boolean expressions.
+   *
+   * @return a BlockToMarkCorrupt object, or null if the replica is not corrupt
    */
-  private boolean isReplicaCorrupt(Block iblk, ReplicaState reportedState,
+  private BlockToMarkCorrupt checkReplicaCorrupt(
+      Block iblk, ReplicaState reportedState,
       BlockInfo storedBlock, BlockUCState ucState,
       DatanodeDescriptor dn) {
     switch(reportedState) {
@@ -1570,17 +1595,31 @@ public class BlockManager {
       switch(ucState) {
       case COMPLETE:
       case COMMITTED:
-        return (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()
-            || storedBlock.getNumBytes() != iblk.getNumBytes());
+        if (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()) {
+          return new BlockToMarkCorrupt(storedBlock,
+              "block is " + ucState + " and reported genstamp " +
+              iblk.getGenerationStamp() + " does not match " +
+              "genstamp in block map " + storedBlock.getGenerationStamp());
+        } else if (storedBlock.getNumBytes() != iblk.getNumBytes()) {
+          return new BlockToMarkCorrupt(storedBlock,
+              "block is " + ucState + " and reported length " +
+              iblk.getNumBytes() + " does not match " +
+              "length in block map " + storedBlock.getNumBytes());
+        } else {
+          return null; // not corrupt
+        }
       default:
-        return false;
+        return null;
       }
     case RBW:
     case RWR:
       if (!storedBlock.isComplete()) {
-        return false;
+        return null; // not corrupt
       } else if (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()) {
-        return true;
+        return new BlockToMarkCorrupt(storedBlock,
+            "reported " + reportedState + " replica with genstamp " +
+            iblk.getGenerationStamp() + " does not match COMPLETE block's " +
+            "genstamp in block map " + storedBlock.getGenerationStamp());
       } else { // COMPLETE block, same genstamp
         if (reportedState == ReplicaState.RBW) {
           // If it's a RBW report for a COMPLETE block, it may just be that
@@ -1590,18 +1629,22 @@ public class BlockManager {
           LOG.info("Received an RBW replica for block " + storedBlock +
               " on " + dn.getName() + ": ignoring it, since the block is " +
               "complete with the same generation stamp.");
-          return false;
+          return null;
         } else {
-          return true;
+          return new BlockToMarkCorrupt(storedBlock,
+              "reported replica has invalid state " + reportedState);
         }
       }
     case RUR:       // should not be reported
     case TEMPORARY: // should not be reported
     default:
-      LOG.warn("Unexpected replica state " + reportedState
-          + " for block: " + storedBlock +
-          " on " + dn.getName() + " size " + storedBlock.getNumBytes());
-      return true;
+      String msg = "Unexpected replica state " + reportedState
+          + " for block: " + storedBlock +
+          " on " + dn.getName() + " size " + storedBlock.getNumBytes();
+      // log here at WARN level since this is really a broken HDFS
+      // invariant
+      LOG.warn(msg);
+      return new BlockToMarkCorrupt(storedBlock, msg);
     }
   }
@@ -2132,7 +2175,7 @@ public class BlockManager {
     // blockReceived reports a finalized block
     Collection<BlockInfo> toAdd = new LinkedList<BlockInfo>();
     Collection<Block> toInvalidate = new LinkedList<Block>();
-    Collection<BlockInfo> toCorrupt = new LinkedList<BlockInfo>();
+    Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
     Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
     processReportedBlock(node, block, ReplicaState.FINALIZED,
         toAdd, toInvalidate, toCorrupt, toUC);
@@ -2153,8 +2196,8 @@ public class BlockManager {
           + " does not belong to any file.");
       addToInvalidates(b, node);
     }
-    for (BlockInfo b : toCorrupt) {
-      markBlockAsCorrupt(b, node);
+    for (BlockToMarkCorrupt b : toCorrupt) {
+      markBlockAsCorrupt(b.blockInfo, node, b.reason);
     }
   }
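The heart of the BlockManager change is the switch from a boolean predicate (isReplicaCorrupt) to a method that returns either null ("not corrupt") or a small value object pairing the block with the reason it is corrupt. A stripped-down, self-contained model of that pattern (a sketch only, not code from this commit; a plain block-id string stands in for the real BlockInfo):

    // Sketch of the boolean -> value-object refactor; not code from this
    // commit. A plain String block id stands in for the real BlockInfo.
    public class CorruptCheckSketch {
      static class BlockToMarkCorrupt {
        final String blockId; // the real class holds a BlockInfo
        final String reason;  // carried along purely for logging

        BlockToMarkCorrupt(String blockId, String reason) {
          this.blockId = blockId;
          this.reason = reason;
        }
      }

      // Before: "boolean isReplicaCorrupt(...)" told callers *that* a
      // replica was corrupt, but the *why* was lost by the time anything
      // was logged. Returning null-or-object keeps the reason attached.
      static BlockToMarkCorrupt checkReplicaCorrupt(String blockId,
          long reportedGenstamp, long storedGenstamp) {
        if (reportedGenstamp != storedGenstamp) {
          return new BlockToMarkCorrupt(blockId,
              "reported genstamp " + reportedGenstamp
              + " does not match genstamp in block map " + storedGenstamp);
        }
        return null; // not corrupt
      }

      public static void main(String[] args) {
        BlockToMarkCorrupt c = checkReplicaCorrupt("blk_42", 1005L, 1007L);
        if (c != null) {
          System.out.println(c.blockId + " marked corrupt because " + c.reason);
        }
      }
    }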
Modified: hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java?rev=1292610&r1=1292609&r2=1292610&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java Thu Feb 23 01:23:17 2012
@@ -44,25 +44,37 @@ public class CorruptReplicasMap{
    *
    * @param blk Block to be added to CorruptReplicasMap
    * @param dn DatanodeDescriptor which holds the corrupt replica
+   * @param reason a textual reason (for logging purposes)
    */
-  public void addToCorruptReplicasMap(Block blk, DatanodeDescriptor dn) {
+  public void addToCorruptReplicasMap(Block blk, DatanodeDescriptor dn,
+      String reason) {
     Collection<DatanodeDescriptor> nodes = getNodes(blk);
     if (nodes == null) {
       nodes = new TreeSet<DatanodeDescriptor>();
       corruptReplicasMap.put(blk, nodes);
     }
+
+    String reasonText;
+    if (reason != null) {
+      reasonText = " because " + reason;
+    } else {
+      reasonText = "";
+    }
+
     if (!nodes.contains(dn)) {
       nodes.add(dn);
       NameNode.stateChangeLog.info("BLOCK NameSystem.addToCorruptReplicasMap: "+
                                    blk.getBlockName() +
                                    " added as corrupt on " + dn.getName() +
-                                   " by " + Server.getRemoteIp());
+                                   " by " + Server.getRemoteIp() +
+                                   reasonText);
     } else {
       NameNode.stateChangeLog.info("BLOCK NameSystem.addToCorruptReplicasMap: "+
                                    "duplicate requested for " +
                                    blk.getBlockName() + " to add as corrupt " +
                                    "on " + dn.getName() +
-                                   " by " + Server.getRemoteIp());
+                                   " by " + Server.getRemoteIp() +
+                                   reasonText);
     }
   }

Modified: hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java?rev=1292610&r1=1292609&r2=1292610&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java Thu Feb 23 01:23:17 2012
@@ -459,7 +459,8 @@ class NameNodeRpcServer implements Namen
       DatanodeInfo[] nodes = blocks[i].getLocations();
       for (int j = 0; j < nodes.length; j++) {
         DatanodeInfo dn = nodes[j];
-        namesystem.getBlockManager().findAndMarkBlockAsCorrupt(blk, dn);
+        namesystem.getBlockManager().findAndMarkBlockAsCorrupt(blk, dn,
+            "client machine reported it");
       }
     }
   }
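With the CorruptReplicasMap change above, the reason now surfaces at the tail of the existing state-change log line. Piecing together the string concatenation in addToCorruptReplicasMap, a typical entry would look roughly like the following (the block name, datanode address, and client IP are invented placeholders):

    BLOCK NameSystem.addToCorruptReplicasMap: blk_1073741825 added as corrupt on 127.0.0.1:50010 by /127.0.0.1 because client machine reported it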
Modified: hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java?rev=1292610&r1=1292609&r2=1292610&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java Thu Feb 23 01:23:17 2012
@@ -147,7 +147,7 @@ public class TestFileCorruption extends
       DatanodeRegistration dnR =
         DataNodeTestUtils.getDNRegistrationForBP(dataNode, blk.getBlockPoolId());
       cluster.getNamesystem().getBlockManager().findAndMarkBlockAsCorrupt(
-          blk, new DatanodeInfo(dnR));
+          blk, new DatanodeInfo(dnR), "TEST");
 
       // open the file
       fs.open(FILE_PATH);

Modified: hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java?rev=1292610&r1=1292609&r2=1292610&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java Thu Feb 23 01:23:17 2012
@@ -83,14 +83,14 @@ public class TestCorruptReplicaInfo exte
       DatanodeDescriptor dn1 = new DatanodeDescriptor();
       DatanodeDescriptor dn2 = new DatanodeDescriptor();
 
-      crm.addToCorruptReplicasMap(getBlock(0), dn1);
+      crm.addToCorruptReplicasMap(getBlock(0), dn1, "TEST");
       assertEquals("Number of corrupt blocks not returning correctly",
                    1, crm.size());
-      crm.addToCorruptReplicasMap(getBlock(1), dn1);
+      crm.addToCorruptReplicasMap(getBlock(1), dn1, "TEST");
       assertEquals("Number of corrupt blocks not returning correctly",
                    2, crm.size());
 
-      crm.addToCorruptReplicasMap(getBlock(1), dn2);
+      crm.addToCorruptReplicasMap(getBlock(1), dn2, "TEST");
       assertEquals("Number of corrupt blocks not returning correctly",
                    2, crm.size());
@@ -103,7 +103,7 @@ public class TestCorruptReplicaInfo exte
                    0, crm.size());
 
       for (Long block_id: block_ids) {
-        crm.addToCorruptReplicasMap(getBlock(block_id), dn1);
+        crm.addToCorruptReplicasMap(getBlock(block_id), dn1, "TEST");
       }
 
       assertEquals("Number of corrupt blocks not returning correctly",

Modified: hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java?rev=1292610&r1=1292609&r2=1292610&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java Thu Feb 23 01:23:17 2012
@@ -174,7 +174,8 @@ public class TestNameNodeMetrics {
     // Corrupt first replica of the block
     LocatedBlock block = NameNodeAdapter.getBlockLocations(
         cluster.getNameNode(), file.toString(), 0, 1).get(0);
-    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0]);
+    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
+        "TEST");
     updateMetrics();
     MetricsRecordBuilder rb = getMetrics(NS_METRICS);
     assertGauge("CorruptBlocks", 1L, rb);
@@ -213,7 +214,8 @@ public class TestNameNodeMetrics {
     // Corrupt the only replica of the block to result in a missing block
     LocatedBlock block = NameNodeAdapter.getBlockLocations(
         cluster.getNameNode(), file.toString(), 0, 1).get(0);
-    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0]);
+    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
+        "TEST");
     updateMetrics();
     MetricsRecordBuilder rb = getMetrics(NS_METRICS);
     assertGauge("UnderReplicatedBlocks", 1L, rb);
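Taken together, the hunks give the new reason parameter three kinds of values, one per layer (lines quoted from the diffs above):

    // Production path (NameNodeRpcServer): a DFS client reported corruption.
    namesystem.getBlockManager().findAndMarkBlockAsCorrupt(blk, dn,
        "client machine reported it");

    // Block-report path (BlockManager): checkReplicaCorrupt builds a
    // descriptive mismatch message and carries it in BlockToMarkCorrupt.
    markBlockAsCorrupt(b.blockInfo, node, b.reason);

    // Tests: a fixed "TEST" marker, easy to grep for in NameNode logs.
    crm.addToCorruptReplicasMap(getBlock(0), dn1, "TEST");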