Author: suresh Date: Fri Sep 4 00:20:56 2009 New Revision: 811185 URL: http://svn.apache.org/viewvc?rev=811185&view=rev Log: HDFS-173. Namenode will not block until a large directory deletion completes. It allows other operations when the deletion is in progress. Added: hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestLargeDirectoryDelete.java Modified: hadoop/hdfs/trunk/CHANGES.txt hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/BlockManager.java hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java Modified: hadoop/hdfs/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/CHANGES.txt?rev=811185&r1=811184&r2=811185&view=diff ============================================================================== --- hadoop/hdfs/trunk/CHANGES.txt (original) +++ hadoop/hdfs/trunk/CHANGES.txt Fri Sep 4 00:20:56 2009 @@ -122,6 +122,9 @@ allows an execution of non-FI tests in FI-enable environment. (Konstantin Boudnik via szetszwo) + HDFS-173. Namenode will not block until a large directory deletion completes. + It allows other operations when the deletion is in progress. (suresh) + BUG FIXES HDFS-76. Better error message to users when commands fail because of Modified: hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/BlockManager.java URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/BlockManager.java?rev=811185&r1=811184&r2=811185&view=diff ============================================================================== --- hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/BlockManager.java (original) +++ hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/BlockManager.java Fri Sep 4 00:20:56 2009 @@ -352,12 +352,13 @@ /** * Adds block to list of blocks which will be invalidated on specified - * datanode and log the move + * datanode * * @param b block * @param dn datanode + * @param log true to create an entry in the log */ - void addToInvalidates(Block b, DatanodeInfo dn) { + void addToInvalidates(Block b, DatanodeInfo dn, boolean log) { Collection invalidateSet = recentInvalidateSets .get(dn.getStorageID()); if (invalidateSet == null) { @@ -366,20 +367,39 @@ } if (invalidateSet.add(b)) { pendingDeletionBlocksCount++; - NameNode.stateChangeLog.info("BLOCK* NameSystem.addToInvalidates: " - + b.getBlockName() + " is added to invalidSet of " + dn.getName()); + if (log) { + NameNode.stateChangeLog.info("BLOCK* NameSystem.addToInvalidates: " + + b.getBlockName() + " to " + dn.getName()); + } } } /** + * Adds block to list of blocks which will be invalidated on specified + * datanode and log the operation + * + * @param b block + * @param dn datanode + */ + void addToInvalidates(Block b, DatanodeInfo dn) { + addToInvalidates(b, dn, true); + } + + /** * Adds block to list of blocks which will be invalidated on all its * datanodes. */ private void addToInvalidates(Block b) { + StringBuilder datanodes = new StringBuilder(); for (Iterator it = blocksMap.nodeIterator(b); it .hasNext();) { DatanodeDescriptor node = it.next(); - addToInvalidates(b, node); + addToInvalidates(b, node, false); + datanodes.append(node.getName()).append(" "); + } + if (datanodes.length() != 0) { + NameNode.stateChangeLog.info("BLOCK* NameSystem.addToInvalidates: " + + b.getBlockName() + " to " + datanodes.toString()); } } Modified: hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java?rev=811185&r1=811184&r2=811185&view=diff ============================================================================== --- hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java (original) +++ hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java Fri Sep 4 00:20:56 2009 @@ -570,19 +570,26 @@ } /** - * Remove the file from management, return blocks + * Delete the target directory and collect the blocks under it + * + * @param src Path of a directory to delete + * @param collectedBlocks Blocks under the deleted directory + * @return true on successful deletion; else false */ - INode delete(String src) { + boolean delete(String src, ListcollectedBlocks) { if (NameNode.stateChangeLog.isDebugEnabled()) { - NameNode.stateChangeLog.debug("DIR* FSDirectory.delete: "+src); + NameNode.stateChangeLog.debug("DIR* FSDirectory.delete: " + src); } waitForReady(); long now = FSNamesystem.now(); - INode deletedNode = unprotectedDelete(src, now); - if (deletedNode != null) { - fsImage.getEditLog().logDelete(src, now); - } - return deletedNode; + INode removedNode = unprotectedDelete(src, collectedBlocks, now); + if (removedNode == null) { + return false; + } + // Blocks will be deleted later by the caller of this method + getFSNamesystem().removePathAndBlocks(src, null); + fsImage.getEditLog().logDelete(src, now); + return true; } /** Return if a directory is empty or not **/ @@ -608,12 +615,30 @@ /** * Delete a path from the name space * Update the count at each ancestor directory with quota + *
+ * Note: This is to be used by {@link FSEditLog} only. + *
+ * @param src a string representation of a path to an inode + * @param mtime the time the inode is removed + * @return deleted inode if deletion succeeds; else null + */ + INode unprotectedDelete(String src, long mtime) { + List collectedBlocks = new ArrayList(); + INode removedNode = unprotectedDelete(src, collectedBlocks, mtime); + getFSNamesystem().removePathAndBlocks(src, collectedBlocks); + return removedNode; + } + + /** + * Delete a path from the name space + * Update the count at each ancestor directory with quota * @param src a string representation of a path to an inode - * @param modificationTime the time the inode is removed - * @param deletedBlocks the place holder for the blocks to be removed - * @return if the deletion succeeds + * @param collectedBlocks blocks collected from the deleted path + * @param mtime the time the inode is removed + * @return deleted inode if deletion succeeds; else null */ - INode unprotectedDelete(String src, long modificationTime) { + INode unprotectedDelete(String src, List collectedBlocks, + long mtime) { src = normalizePath(src); synchronized (rootDir) { @@ -624,33 +649,34 @@ NameNode.stateChangeLog.debug("DIR* FSDirectory.unprotectedDelete: " +"failed to remove "+src+" because it does not exist"); return null; - } else if (inodes.length == 1) { // src is the root + } + if (inodes.length == 1) { // src is the root NameNode.stateChangeLog.warn("DIR* FSDirectory.unprotectedDelete: " + "failed to remove " + src + " because the root is not allowed to be deleted"); return null; - } else { - try { - // Remove the node from the namespace - removeChild(inodes, inodes.length-1); - // set the parent's modification time - inodes[inodes.length-2].setModificationTime(modificationTime); - // GC all the blocks underneath the node. - ArrayList v = new ArrayList(); - int filesRemoved = targetNode.collectSubtreeBlocksAndClear(v); - incrDeletedFileCount(filesRemoved); - getFSNamesystem().removePathAndBlocks(src, v); - if (NameNode.stateChangeLog.isDebugEnabled()) { - NameNode.stateChangeLog.debug("DIR* FSDirectory.unprotectedDelete: " - +src+" is removed"); - } - return targetNode; - } catch(QuotaExceededException e) { - NameNode.stateChangeLog.warn("DIR* FSDirectory.unprotectedDelete: " + - "failed to remove " + src + " because " + e.getMessage()); - return null; - } } + int pos = inodes.length - 1; + try { + // Remove the node from the namespace + targetNode = removeChild(inodes, pos); + } catch(QuotaExceededException e) { + NameNode.stateChangeLog.warn("DIR* FSDirectory.unprotectedDelete: " + + "failed to remove " + src + " because " + e.getMessage()); + return null; + } + if (targetNode == null) { + return null; + } + // set the parent's modification time + inodes[pos-1].setModificationTime(mtime); + int filesRemoved = targetNode.collectSubtreeBlocksAndClear(collectedBlocks); + incrDeletedFileCount(filesRemoved); + if (NameNode.stateChangeLog.isDebugEnabled()) { + NameNode.stateChangeLog.debug("DIR* FSDirectory.unprotectedDelete: " + +src+" is removed"); + } + return targetNode; } } Modified: hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java?rev=811185&r1=811184&r2=811185&view=diff ============================================================================== --- hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java (original) +++ hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java Fri Sep 4 00:20:56 2009 @@ -123,6 +123,7 @@ public static final Log auditLog = LogFactory.getLog( FSNamesystem.class.getName() + ".audit"); + static int BLOCK_DELETION_INCREMENT = 1000; private boolean isPermissionEnabled; private UserGroupInformation fsOwner; private String supergroup; @@ -1385,8 +1386,10 @@ if ((!recursive) && (!dir.isDirEmpty(src))) { throw new IOException(src + " is non empty"); } + if (NameNode.stateChangeLog.isDebugEnabled()) { + NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src); + } boolean status = deleteInternal(src, true); - getEditLog().logSync(); if (status && auditLog.isInfoEnabled()) { logAuditEvent(UserGroupInformation.getCurrentUGI(), Server.getRemoteIp(), @@ -1396,25 +1399,68 @@ } /** - * Remove the indicated filename from the namespace. This may - * invalidate some blocks that make up the file. + * Remove a file/directory from the namespace. + *

+ * For large directories, deletion is incremental. The blocks under + * the directory are collected and deleted a small number at a time holding + * the {@link FSNamesystem} lock. + *

+ * For small directory or file the deletion is done in one shot. */ - synchronized boolean deleteInternal(String src, + private boolean deleteInternal(String src, boolean enforcePermission) throws IOException { - if (NameNode.stateChangeLog.isDebugEnabled()) { - NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src); + boolean deleteNow = false; + ArrayList collectedBlocks = new ArrayList(); + synchronized(this) { + if (isInSafeMode()) { + throw new SafeModeException("Cannot delete " + src, safeMode); + } + if (enforcePermission && isPermissionEnabled) { + checkPermission(src, false, null, FsAction.WRITE, null, FsAction.ALL); + } + // Unlink the target directory from directory tree + if (!dir.delete(src, collectedBlocks)) { + return false; + } + deleteNow = collectedBlocks.size() <= BLOCK_DELETION_INCREMENT; + if (deleteNow) { // Perform small deletes right away + removeBlocks(collectedBlocks); + } } - if (isInSafeMode()) - throw new SafeModeException("Cannot delete " + src, safeMode); - if (enforcePermission && isPermissionEnabled) { - checkPermission(src, false, null, FsAction.WRITE, null, FsAction.ALL); + // Log directory deletion to editlog + getEditLog().logSync(); + if (!deleteNow) { + removeBlocks(collectedBlocks); // Incremental deletion of blocks } - - return dir.delete(src) != null; + collectedBlocks.clear(); + if (NameNode.stateChangeLog.isDebugEnabled()) { + NameNode.stateChangeLog.debug("DIR* Namesystem.delete: " + + src +" is removed"); + } + return true; } + /** From the given list, incrementally remove the blocks from blockManager */ + private void removeBlocks(List blocks) { + int start = 0; + int end = 0; + while (start < blocks.size()) { + end = BLOCK_DELETION_INCREMENT + start; + end = end > blocks.size() ? blocks.size() : end; + synchronized(this) { + for (int i=start; i blocks) { leaseManager.removeLeaseWithPrefixPath(src); + if (blocks == null) { + return; + } for(Block b : blocks) { blockManager.removeBlock(b); } Modified: hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java?rev=811185&r1=811184&r2=811185&view=diff ============================================================================== --- hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java (original) +++ hadoop/hdfs/trunk/src/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java Fri Sep 4 00:20:56 2009 @@ -112,8 +112,9 @@ int collectSubtreeBlocksAndClear(List v) { parent = null; - for (Block blk : blocks) { + for (BlockInfo blk : blocks) { v.add(blk); + blk.setINode(null); } blocks = null; return 1; Added: hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestLargeDirectoryDelete.java URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestLargeDirectoryDelete.java?rev=811185&view=auto ============================================================================== --- hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestLargeDirectoryDelete.java (added) +++ hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestLargeDirectoryDelete.java Fri Sep 4 00:20:56 2009 @@ -0,0 +1,152 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.IOException; +import java.util.Random; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.Assert; +import org.junit.Test; + + +/** + * Ensure during large directory delete, namenode does not block until the + * deletion completes and handles new requests from other clients + */ +public class TestLargeDirectoryDelete { + private static final Log LOG = LogFactory.getLog(TestLargeDirectoryDelete.class); + private static final Configuration CONF = new Configuration(); + private static final int TOTAL_BLOCKS = 10000; + private MiniDFSCluster mc = null; + private int createOps = 0; + private int lockOps = 0; + + static { + CONF.setLong("dfs.block.size", 1); + CONF.setInt("io.bytes.per.checksum", 1); + } + + /** create a file with a length of filelen */ + private void createFile(final String fileName, final long filelen) throws IOException { + FileSystem fs = mc.getFileSystem(); + Path filePath = new Path(fileName); + DFSTestUtil.createFile(fs, filePath, filelen, (short) 1, 0); + } + + /** Create a large number of directories and files */ + private void createFiles() throws IOException { + Random rand = new Random(); + // Create files in a directory with random depth + // ranging from 0-10. + for (int i = 0; i < TOTAL_BLOCKS; i+=100) { + String filename = "/root/"; + int dirs = rand.nextInt(10); // Depth of the directory + for (int j=i; j >=(i-dirs); j--) { + filename += j + "/"; + } + filename += "file" + i; + createFile(filename, 100); + } + } + + private int getBlockCount() { + return (int)mc.getNamesystem().getBlocksTotal(); + } + + /** Run multiple threads doing simultaneous operations on the namenode + * while a large directory is being deleted. + */ + private void runThreads() throws IOException { + final Thread threads[] = new Thread[2]; + + // Thread for creating files + threads[0] = new Thread() { + @Override + public void run() { + while(true) { + try { + int blockcount = getBlockCount(); + if (blockcount < TOTAL_BLOCKS && blockcount > 0) { + String file = "/tmp" + createOps; + createFile(file, 1); + mc.getFileSystem().delete(new Path(file), true); + createOps++; + } + } catch (IOException ex) { + LOG.info("createFile exception ", ex); + break; + } + } + } + }; + + // Thread that periodically acquires the FSNamesystem lock + threads[1] = new Thread() { + @Override + public void run() { + while(true) { + try { + int blockcount = getBlockCount(); + if (blockcount < TOTAL_BLOCKS && blockcount > 0) { + synchronized(mc.getNamesystem()) { + lockOps++; + } + Thread.sleep(1); + } + } catch (InterruptedException ex) { + LOG.info("lockOperation exception ", ex); + break; + } + } + } + }; + threads[0].start(); + threads[1].start(); + + final long start = System.currentTimeMillis(); + FSNamesystem.BLOCK_DELETION_INCREMENT = 1; + mc.getFileSystem().delete(new Path("/root"), true); // recursive delete + final long end = System.currentTimeMillis(); + threads[0].interrupt(); + threads[1].interrupt(); + LOG.info("Deletion took " + (end - start) + "msecs"); + LOG.info("createOperations " + createOps); + LOG.info("lockOperations " + lockOps); + Assert.assertTrue(lockOps + createOps > 0); + } + + @Test + public void largeDelete() throws IOException, InterruptedException { + mc = new MiniDFSCluster(CONF, 1, true, null); + try { + mc.waitActive(); + createFiles(); + Assert.assertEquals(TOTAL_BLOCKS, getBlockCount()); + runThreads(); + } finally { + mc.shutdown(); + } + } +} \ No newline at end of file