Return-Path: Delivered-To: apmail-hadoop-mapreduce-commits-archive@minotaur.apache.org Received: (qmail 2213 invoked from network); 2 Sep 2010 07:31:05 -0000 Received: from unknown (HELO mail.apache.org) (140.211.11.3) by 140.211.11.9 with SMTP; 2 Sep 2010 07:31:05 -0000 Received: (qmail 34502 invoked by uid 500); 2 Sep 2010 07:31:04 -0000 Delivered-To: apmail-hadoop-mapreduce-commits-archive@hadoop.apache.org Received: (qmail 34414 invoked by uid 500); 2 Sep 2010 07:31:02 -0000 Mailing-List: contact mapreduce-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: mapreduce-dev@hadoop.apache.org Delivered-To: mailing list mapreduce-commits@hadoop.apache.org Received: (qmail 34406 invoked by uid 99); 2 Sep 2010 07:31:01 -0000 Received: from Unknown (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 02 Sep 2010 07:31:01 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 02 Sep 2010 07:30:43 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id D5F88238890A; Thu, 2 Sep 2010 07:29:23 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r991827 - in /hadoop/mapreduce/trunk: CHANGES.txt src/java/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.java src/test/mapred/org/apache/hadoop/mapreduce/lib/input/TestCombineFileInputFormat.java Date: Thu, 02 Sep 2010 07:29:23 -0000 To: mapreduce-commits@hadoop.apache.org From: amareshwari@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20100902072923.D5F88238890A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: amareshwari Date: Thu Sep 2 07:29:23 2010 New Revision: 991827 URL: http://svn.apache.org/viewvc?rev=991827&view=rev Log: MAPREDUCE-2021. Fixes duplicate hostnames in CombineFileInputFormat's split locations. Modified: hadoop/mapreduce/trunk/CHANGES.txt hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.java hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapreduce/lib/input/TestCombineFileInputFormat.java Modified: hadoop/mapreduce/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=991827&r1=991826&r2=991827&view=diff ============================================================================== --- hadoop/mapreduce/trunk/CHANGES.txt (original) +++ hadoop/mapreduce/trunk/CHANGES.txt Thu Sep 2 07:29:23 2010 @@ -265,6 +265,9 @@ Trunk (unreleased changes) MAPREDUCE-1668. RaidNode Hars a directory only if all its parity files have been created. (Ramkumar Vadali via dhruba) + MAPREDUCE-2021. Fixes duplicate hostnames in CombineFileInputFormat's + split locations. (amareshwari) + Release 0.21.0 - Unreleased INCOMPATIBLE CHANGES Modified: hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.java?rev=991827&r1=991826&r2=991827&view=diff ============================================================================== --- hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.java (original) +++ hadoop/mapreduce/trunk/src/java/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.java Thu Sep 2 07:29:23 2010 @@ -20,6 +20,7 @@ package org.apache.hadoop.mapreduce.lib. import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.LinkedList; import java.util.HashSet; import java.util.List; @@ -272,7 +273,7 @@ public abstract class CombineFileInputFo } ArrayList validBlocks = new ArrayList(); - ArrayList nodes = new ArrayList(); + Set nodes = new HashSet(); long curSplitSize = 0; // process all nodes and create splits that are local @@ -326,7 +327,7 @@ public abstract class CombineFileInputFo // in 'overflow'. After the processing of all racks is complete, these // overflow blocks will be combined into splits. ArrayList overflowBlocks = new ArrayList(); - ArrayList racks = new ArrayList(); + Set racks = new HashSet(); // Process all racks over and over again until there is no more work to do. while (blockToNodes.size() > 0) { @@ -431,7 +432,7 @@ public abstract class CombineFileInputFo * Add this new split into splitList. */ private void addCreatedSplit(List splitList, - List locations, + Collection locations, ArrayList validBlocks) { // create an input split Path[] fl = new Path[validBlocks.size()]; @@ -577,8 +578,8 @@ public abstract class CombineFileInputFo hosts.add(host); } - private List getHosts(List racks) { - List hosts = new ArrayList(); + private Set getHosts(Set racks) { + Set hosts = new HashSet(); for (String rack : racks) { hosts.addAll(rackToNodes.get(rack)); } Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapreduce/lib/input/TestCombineFileInputFormat.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapreduce/lib/input/TestCombineFileInputFormat.java?rev=991827&r1=991826&r2=991827&view=diff ============================================================================== --- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapreduce/lib/input/TestCombineFileInputFormat.java (original) +++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/mapreduce/lib/input/TestCombineFileInputFormat.java Thu Sep 2 07:29:23 2010 @@ -69,6 +69,7 @@ public class TestCombineFileInputFormat final Path dir2 = new Path(inDir, "/dir2"); final Path dir3 = new Path(inDir, "/dir3"); final Path dir4 = new Path(inDir, "/dir4"); + final Path dir5 = new Path(inDir, "/dir5"); static final int BLOCKSIZE = 1024; static final byte[] databuf = new byte[BLOCKSIZE]; @@ -245,16 +246,16 @@ public class TestCombineFileInputFormat MiniDFSCluster dfs = null; FileSystem fileSys = null; try { - /* Start 3 datanodes, one each in rack r1, r2, r3. Create three files - * 1) file1, just after starting the datanode on r1, with + /* Start 3 datanodes, one each in rack r1, r2, r3. Create five files + * 1) file1 and file5, just after starting the datanode on r1, with * a repl factor of 1, and, * 2) file2, just after starting the datanode on r2, with * a repl factor of 2, and, - * 3) file3 after starting the all three datanodes, with a repl + * 3) file3, file4 after starting the all three datanodes, with a repl * factor of 3. - * At the end, file1 will be present on only datanode1, file2 will be - * present on datanode 1 and datanode2 and - * file3 will be present on all datanodes. + * At the end, file1, file5 will be present on only datanode1, file2 will + * be present on datanode 1 and datanode2 and + * file3, file4 will be present on all datanodes. */ Configuration conf = new Configuration(); conf.setBoolean("dfs.replication.considerLoad", false); @@ -267,6 +268,30 @@ public class TestCombineFileInputFormat } Path file1 = new Path(dir1 + "/file1"); writeFile(conf, file1, (short)1, 1); + // create another file on the same datanode + Path file5 = new Path(dir5 + "/file5"); + writeFile(conf, file5, (short)1, 1); + // split it using a CombinedFile input format + DummyInputFormat inFormat = new DummyInputFormat(); + Job job = Job.getInstance(conf); + FileInputFormat.setInputPaths(job, dir1 + "," + dir5); + List splits = inFormat.getSplits(job); + System.out.println("Made splits(Test0): " + splits.size()); + for (InputSplit split : splits) { + System.out.println("File split(Test0): " + split); + } + assertEquals(splits.size(), 1); + CombineFileSplit fileSplit = (CombineFileSplit) splits.get(0); + assertEquals(2, fileSplit.getNumPaths()); + assertEquals(1, fileSplit.getLocations().length); + assertEquals(file1.getName(), fileSplit.getPath(0).getName()); + assertEquals(0, fileSplit.getOffset(0)); + assertEquals(BLOCKSIZE, fileSplit.getLength(0)); + assertEquals(file5.getName(), fileSplit.getPath(1).getName()); + assertEquals(0, fileSplit.getOffset(1)); + assertEquals(BLOCKSIZE, fileSplit.getLength(1)); + assertEquals(hosts1[0], fileSplit.getLocations()[0]); + dfs.startDataNodes(conf, 1, true, null, rack2, hosts2, null); dfs.waitActive(); @@ -275,11 +300,10 @@ public class TestCombineFileInputFormat writeFile(conf, file2, (short)2, 2); // split it using a CombinedFile input format - DummyInputFormat inFormat = new DummyInputFormat(); - Job job = Job.getInstance(conf); + inFormat = new DummyInputFormat(); FileInputFormat.setInputPaths(job, dir1 + "," + dir2); inFormat.setMinSplitSizeRack(BLOCKSIZE); - List splits = inFormat.getSplits(job); + splits = inFormat.getSplits(job); System.out.println("Made splits(Test1): " + splits.size()); // make sure that each split has different locations @@ -287,7 +311,7 @@ public class TestCombineFileInputFormat System.out.println("File split(Test1): " + split); } assertEquals(splits.size(), 2); - CombineFileSplit fileSplit = (CombineFileSplit) splits.get(0); + fileSplit = (CombineFileSplit) splits.get(0); assertEquals(fileSplit.getNumPaths(), 2); assertEquals(fileSplit.getLocations().length, 1); assertEquals(fileSplit.getPath(0).getName(), file2.getName());