From: cdouglas@apache.org
To: mapreduce-commits@hadoop.apache.org
Reply-To: mapreduce-dev@hadoop.apache.org
Subject: svn commit: r794942 - in /hadoop/mapreduce/trunk: CHANGES.txt src/test/mapred/org/apache/hadoop/cli/testMRConf.xml src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java src/tools/org/apache/hadoop/tools/HadoopArchives.java
Date: Fri, 17 Jul 2009 02:04:15 -0000
Message-Id: <20090717020415.D51832388A56@eris.apache.org>

Author: cdouglas
Date: Fri Jul 17 02:04:15 2009
New Revision: 794942

URL: http://svn.apache.org/viewvc?rev=794942&view=rev
Log:
MAPREDUCE-739. Allow relative paths to be created in archives. Contributed by Mahadev Konar

Modified:
    hadoop/mapreduce/trunk/CHANGES.txt
    hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
    hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java

Modified: hadoop/mapreduce/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/CHANGES.txt (original)
+++ hadoop/mapreduce/trunk/CHANGES.txt Fri Jul 17 02:04:15 2009
@@ -114,6 +114,9 @@
     MAPREDUCE-353. Makes the shuffle read and connection timeouts
     configurable. (Ravi Gummadi via ddas)
 
+    MAPREDUCE-739. Allow relative paths to be created in archives. (Mahadev
+    Konar via cdouglas)
+
   BUG FIXES
 
     MAPREDUCE-703. Sqoop requires dependency on hsqldb in ivy.
    (Aaron Kimball via matei)
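For context, the archive command now takes a required -p parent path and sources relative to it. A minimal sketch of the new invocation through ToolRunner, modeled on the testRelativeArchives case added below (class name and paths are placeholders, not part of this commit):

    // Archive everything under /user/hadoop/test into /user/hadoop/tmp/foo.har,
    // storing entries relative to /user/hadoop. Paths are illustrative only.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.tools.HadoopArchives;
    import org.apache.hadoop.util.ToolRunner;

    public class HarRelativePathExample {
      public static void main(String[] argv) throws Exception {
        Configuration conf = new Configuration();
        String[] args = new String[] {
          "-archiveName", "foo.har",
          "-p", "/user/hadoop",   // new required parent path
          "test",                 // source, relative to the parent
          "/user/hadoop/tmp"      // destination directory
        };
        int ret = ToolRunner.run(new HadoopArchives(conf), args);
        System.exit(ret);
      }
    }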
Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml (original)
+++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml Fri Jul 17 02:04:15 2009
@@ -1,6 +1,21 @@
-
+
@@ -53,7 +68,7 @@
         -fs NAMENODE -touchz /dir0/file0
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName dest/test.har /dir0/
+        -fs NAMENODE -archiveName dest/test.har -p / dir0/
         -fs NAMENODE -rmr /dir0 /dest
@@ -61,11 +76,11 @@
         RegexpComparator
-          archive -archiveName NAME <src>\* <dest>
+          archive -archiveName NAME -p <parent path> <src>\* <dest>
         TokenComparator
-          Invalid usage.
+          Invalid name for archives. dest/test.har
@@ -81,8 +96,8 @@
         -fs NAMENODE -touchz /dir0/dir1/file2
         -fs NAMENODE -touchz /dir0/dir2/file1
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName dir0.har /dir0/ /dest
-        -fs NAMENODE -archiveName dir0.har /dir0/ /dest
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /dest
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /dest
         -fs NAMENODE -rmr /dir0 /dest
@@ -90,7 +105,7 @@
         TokenComparator
-          Invalid Output.
+          Invalid Output: /dest/dir0.har
@@ -105,7 +120,7 @@
         -fs NAMENODE -touchz /dir0/dir1/file1
         -fs NAMENODE -touchz /dir0/dir1/file2
         -fs NAMENODE -touchz /dir0/dir2/file1
-        -fs NAMENODE -archiveName dir0.har /dir0/ /dir0/
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /dir0/
         -fs NAMENODE -ls /dir0/
@@ -147,7 +162,7 @@
         -fs NAMENODE -touchz /dir0/dir1/file2
         -fs NAMENODE -touchz /dir0/dir2/file1
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName dir0.har /dir0/ /file1
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /file1
         -fs NAMENODE -rmr /*
@@ -155,7 +170,7 @@
         TokenComparator
-          Invalid Output.
+          Invalid Output: /file1/dir0.har
@@ -171,7 +186,7 @@
         -fs NAMENODE -touchz /dir0/dir1/file2
         -fs NAMENODE -touchz /dir0/dir2/file1
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName dir0.har /dir0/ /dest
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /dest
         -fs NAMENODE -rm har:///dest/dir0.har/dir0/file0
@@ -196,7 +211,7 @@
         -fs NAMENODE -touchz /dir0/dir1/file2
         -fs NAMENODE -touchz /dir0/dir2/file1
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName dir0.har /dir0/ /dest
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /dest
         -fs NAMENODE -mv har:///dest/dir0.har/dir0/file0 har:///dest/dir0.har/dir0/file1
@@ -225,7 +240,7 @@
         -fs NAMENODE -touchz /dir0/dir1/file2
         -fs NAMENODE -touchz /dir0/dir2/file1
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName dir0.har /dir0/ /dest
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /dest
         -fs NAMENODE -count har:///dest/dir0.har/dir0/file0
@@ -244,7 +259,7 @@
         -fs NAMENODE -mkdir /dir0
         -fs NAMENODE -touchz /dir0/file0
-        -fs NAMENODE -archiveName dir0.har /dir0
+        -fs NAMENODE -archiveName dir0.har -p /dir0
         -fs NAMENODE -rmr /*
@@ -252,7 +267,7 @@
         RegexpComparator
-          archive -archiveName NAME <src>\* <dest>
+          archive -archiveName NAME -p <parent path> <src>\* <dest>
         TokenComparator
@@ -266,7 +281,7 @@
         -fs NAMENODE -mkdir /dir0
         -fs NAMENODE -touchz /dir0/file0
-        -fs NAMENODE -archiveName /dir0 /dest
+        -fs NAMENODE -archiveName -p / dir0 /dest
         -fs NAMENODE -rmr /*
@@ -274,11 +289,11 @@
         RegexpComparator
-          archive -archiveName NAME <src>\* <dest>
+          archive -archiveName NAME -p <parent path> <src>\* <dest>
         TokenComparator
-          Invalid usage.
+          archive -archiveName NAME -p <parent path> <src>* <dest>
@@ -291,7 +306,7 @@
         -fs NAMENODE -touchz /dir0/file1
         -fs NAMENODE -touchz /dir0/file2
         -fs NAMENODE -mkdir /dir1
-        -fs NAMENODE -archiveName test.har /dir0/file* /dir1
+        -fs NAMENODE -archiveName test.har -p / dir0/file* /dir1
         -fs NAMENODE -ls /dir1
@@ -318,7 +333,7 @@
         -fs NAMENODE -touchz /dir0/file0
         -fs NAMENODE -touchz /dir1/file1
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName test.har /dir* /dest
+        -fs NAMENODE -archiveName test.har -p / dir* /dest
         -fs NAMENODE -ls /dest
@@ -341,7 +356,7 @@
         -fs NAMENODE -touchz /file0
         -fs NAMENODE -mkdir /dir1
-        -fs NAMENODE -archiveName test.har /file0 /dir1
+        -fs NAMENODE -archiveName test.har -p / file0 /dir1
         -fs NAMENODE -ls /dir1
@@ -364,7 +379,7 @@
         -fs NAMENODE -mkdir /dir0
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName test.har /dir0 /dest
+        -fs NAMENODE -archiveName test.har -p / dir0 /dest
         -fs NAMENODE -ls /dest
@@ -385,7 +400,7 @@
       Archive: Invalid Source is specified
-        -fs NAMENODE -archiveName test.har file0 /dest
+        -fs NAMENODE -archiveName test.har -p file0 /dest
         -fs NAMENODE -rmr /*
@@ -403,19 +418,19 @@
         -fs NAMENODE -mkdir /dir0
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName test /dir0 /dest
+        -fs NAMENODE -archiveName test -p / dir0 /dest
         -fs NAMENODE -rmr /*
-        TokenComparator
-          archive -archiveName NAME <src>* <dest>
+        RegexpComparator
+          archive -archiveName NAME -p <parent path> <src>\* <dest>
         TokenComparator
-          Invalid name for archives. test
+          archive -archiveName NAME -p <parent path> <src>* <dest>
@@ -425,7 +440,7 @@
         -fs NAMENODE -mkdir /dir0
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName test.har /dir0 /dest
+        -fs NAMENODE -archiveName test.har -p / dir0 /dest
         -fs NAMENODE -rmr /dest/test.har
         -fs NAMENODE -ls /dest/
@@ -445,7 +460,7 @@
         -fs NAMENODE -mkdir /dir0
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName test.har /dir0 /dest
+        -fs NAMENODE -archiveName test.har -p / dir0 /dest
         -fs NAMENODE -mv /dest/test.har /dest/test1.har
         -fs NAMENODE -ls /dest/
@@ -475,7 +490,7 @@
         -fs NAMENODE -touchz /dir0/dir1/file2
         -fs NAMENODE -touchz /dir0/dir2/file1
         -fs NAMENODE -mkdir /dest
-        -fs NAMENODE -archiveName dir0.har /dir0/ /dest
+        -fs NAMENODE -archiveName dir0.har -p / dir0/ /dest
         -fs NAMENODE -lsr har:///dest/dir0.har/dir0

Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java (original)
+++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java Fri Jul 17 02:04:15 2009
@@ -19,6 +19,7 @@
 package org.apache.hadoop.tools;
 
 import java.io.IOException;
+import java.net.URI;
 import java.util.Iterator;
 
 import junit.framework.TestCase;
@@ -44,7 +45,7 @@
  * and then run a map reduce job
  */
 public class TestHarFileSystem extends TestCase {
-  private Path inputPath;
+  private Path inputPath, inputrelPath;
   private MiniDFSCluster dfscluster;
   private MiniMRCluster mapred;
   private FileSystem fs;
@@ -53,14 +54,26 @@
   protected void setUp() throws Exception {
     super.setUp();
-    dfscluster = new MiniDFSCluster(new JobConf(), 2, true, null);
+    dfscluster = new MiniDFSCluster(new Configuration(), 2, true, null);
     fs = dfscluster.getFileSystem();
     mapred = new MiniMRCluster(2, fs.getUri().toString(), 1);
     inputPath = new Path(fs.getHomeDirectory(), "test");
+    inputrelPath = new Path(fs.getHomeDirectory().toUri().
+        getPath().substring(1), "test");
     filea = new Path(inputPath,"a");
     fileb = new Path(inputPath,"b");
     filec = new Path(inputPath,"c");
     archivePath = new Path(fs.getHomeDirectory(), "tmp");
+    fs.mkdirs(inputPath);
+    FSDataOutputStream out = fs.create(filea);
+    out.write("a".getBytes());
+    out.close();
+    out = fs.create(fileb);
+    out.write("b".getBytes());
+    out.close();
+    out = fs.create(filec);
+    out.write("c".getBytes());
+    out.close();
   }
 
   protected void tearDown() throws Exception {
@@ -100,45 +113,90 @@
     }
   }
 
-  public void testArchives() throws Exception {
-    fs.mkdirs(inputPath);
-
-    FSDataOutputStream out = fs.create(filea);
-    out.write("a".getBytes());
-    out.close();
-    out = fs.create(fileb);
-    out.write("b".getBytes());
-    out.close();
-    out = fs.create(filec);
-    out.write("c".getBytes());
-    out.close();
+  // test archives with a -p option
+  public void testRelativeArchives() throws Exception {
+    fs.delete(archivePath,true);
     Configuration conf = mapred.createJobConf();
     HadoopArchives har = new HadoopArchives(conf);
-    String[] args = new String[3];
+    String[] args = new String[6];
+    args[0] = "-archiveName";
+    args[1] = "foo.har";
+    args[2] = "-p";
+    args[3] = fs.getHomeDirectory().toString();
+    args[4] = "test";
+    args[5] = archivePath.toString();
+    int ret = ToolRunner.run(har, args);
+    assertTrue("failed test", ret == 0);
+    Path finalPath = new Path(archivePath, "foo.har");
+    Path fsPath = new Path(inputPath.toUri().getPath());
+    Path filePath = new Path(finalPath, "test");
+    //make it a har path
+    Path harPath = new Path("har://" + filePath.toUri().getPath());
+    assertTrue(fs.exists(new Path(finalPath, "_index")));
+    assertTrue(fs.exists(new Path(finalPath, "_masterindex")));
+    assertTrue(!fs.exists(new Path(finalPath, "_logs")));
+    args = new String[2];
+    args[0] = "-ls";
+    args[1] = harPath.toString();
+    FsShell shell = new FsShell(conf);
+    ret = ToolRunner.run(shell, args);
+    // fileb and filec
+    assertTrue(ret == 0);
+    Path harFilea = new Path(harPath, "a");
+    Path harFileb = new Path(harPath, "b");
+    Path harFilec = new Path(harPath, "c");
+    FileSystem harFs = harFilea.getFileSystem(conf);
+    FSDataInputStream fin = harFs.open(harFilea);
+    byte[] b = new byte[4];
+    int readBytes = fin.read(b);
+    fin.close();
+    assertTrue("strings are equal ", (b[0] == "a".getBytes()[0]));
+    fin = harFs.open(harFileb);
+    fin.read(b);
+    fin.close();
+    assertTrue("strings are equal ", (b[0] == "b".getBytes()[0]));
+    fin = harFs.open(harFilec);
+    fin.read(b);
+    fin.close();
+    assertTrue("strings are equal ", (b[0] == "c".getBytes()[0]));
+  }
+
+
+  public void testArchivesWithMapred() throws Exception {
+    fs.delete(archivePath, true);
     Configuration conf = mapred.createJobConf();
     HadoopArchives har = new HadoopArchives(conf);
     String[] args = new String[4];
+    //check for destination not specfied
     args[0] = "-archiveName";
     args[1] = "foo.har";
-    args[2] = inputPath.toString();
+    args[2] = "-p";
+    args[3] = "/";
     int ret = ToolRunner.run(har, args);
     assertTrue(ret != 0);
-    args = new String[4];
+    args = new String[6];
     //check for wrong archiveName
     args[0] = "-archiveName";
     args[1] = "/d/foo.har";
-    args[2] = inputPath.toString();
-    args[3] = archivePath.toString();
+    args[2] = "-p";
+    args[3] = "/";
+    args[4] = inputrelPath.toString();
+    args[5] = archivePath.toString();
     ret = ToolRunner.run(har, args);
     assertTrue(ret != 0);
-// se if dest is a file
+    // se if dest is a file
     args[1] = "foo.har";
-    args[3] = filec.toString();
+    args[5] = filec.toString();
     ret = ToolRunner.run(har, args);
     assertTrue(ret != 0);
     //this is a valid run
     args[0] = "-archiveName";
     args[1] = "foo.har";
-    args[2] = inputPath.toString();
-    args[3] = archivePath.toString();
+    args[2] = "-p";
+    args[3] = "/";
+    args[4] = inputrelPath.toString();
+    args[5] = archivePath.toString();
     ret = ToolRunner.run(har, args);
     //checl for the existenece of the archive
     assertTrue(ret == 0);
@@ -151,13 +209,16 @@
     String relative = fsPath.toString().substring(1);
     Path filePath = new Path(finalPath, relative);
     //make it a har path
-    Path harPath = new Path("har://" + filePath.toUri().getPath());
+    URI uri = fs.getUri();
+    Path harPath = new Path("har://" + "hdfs-" + uri.getHost() +":" +
+        uri.getPort() + filePath.toUri().getPath());
     assertTrue(fs.exists(new Path(finalPath, "_index")));
     assertTrue(fs.exists(new Path(finalPath, "_masterindex")));
     assertTrue(!fs.exists(new Path(finalPath, "_logs")));
     //creation tested
     //check if the archive is same
     // do ls and cat on all the files
+    FsShell shell = new FsShell(conf);
     args = new String[2];
     args[0] = "-ls";
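For reference, archived files are read back through the har filesystem, as exercised by the tests above; a minimal sketch of that read-back pattern (host, port, and paths are placeholders, not part of this commit):

    // Open one entry of an archive through a har:// path. The authority names
    // the underlying filesystem ("hdfs-host:port"); the rest is the path of
    // the .har directory plus the entry inside it. Values are illustrative only.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class HarReadExample {
      public static void main(String[] argv) throws Exception {
        Configuration conf = new Configuration();
        Path harFile =
          new Path("har://hdfs-localhost:8020/user/hadoop/tmp/foo.har/test/a");
        FileSystem harFs = harFile.getFileSystem(conf);
        FSDataInputStream in = harFs.open(harFile);
        byte[] buf = new byte[4];
        int read = in.read(buf);
        in.close();
        System.out.println("read " + read + " bytes");
      }
    }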
Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java Fri Jul 17 02:04:15 2009
@@ -59,6 +59,8 @@
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 
+import com.sun.corba.se.spi.ior.MakeImmutable;
+
 /**
  * a archive creation utility.
@@ -77,12 +79,13 @@
   static final String SRC_COUNT_LABEL = NAME + ".src.count";
   static final String TOTAL_SIZE_LABEL = NAME + ".total.size";
   static final String DST_HAR_LABEL = NAME + ".archive.name";
+  static final String SRC_PARENT_LABEL = NAME + ".parent.path";
   // size of each part file
   // its fixed for now.
   static final long partSize = 2 * 1024 * 1024 * 1024l;
 
   private static final String usage = "archive"
-  + " -archiveName NAME <src>* <dest> " +
+  + " -archiveName NAME -p <parent path> <src>* <dest> " +
   "\n";
@@ -228,24 +231,53 @@
     return deepest;
   }
 
-  // this method is tricky. This method writes
-  // the top level directories in such a way so that
-  // the output only contains valid directoreis in archives.
-  // so for an input path specified by the user
-  // as /user/hadoop
-  // we need to index
-  // / as the root
-  // /user as a directory
-  // /user/hadoop as a directory
-  // so for multiple input paths it makes sure that it
-  // does the right thing.
-  // so if the user specifies the input directories as
-  // /user/harry and /user/hadoop
-  // we need to write / and user as its child
-  // and /user and harry and hadoop as its children
+  /**
+   * truncate the prefix root from the full path
+   * @param fullPath the full path
+   * @param root the prefix root to be truncated
+   * @return the relative path
+   */
+  private Path relPathToRoot(Path fullPath, Path root) {
+    // just take some effort to do it
+    // rather than just using substring
+    // so that we do not break sometime later
+    Path justRoot = new Path(Path.SEPARATOR);
+    if (fullPath.depth() == root.depth()) {
+      return justRoot;
+    }
+    else if (fullPath.depth() > root.depth()) {
+      Path retPath = new Path(fullPath.getName());
+      Path parent = fullPath.getParent();
+      for (int i=0; i < (fullPath.depth() - root.depth() -1); i++) {
+        retPath = new Path(parent.getName(), retPath);
+        parent = parent.getParent();
+      }
+      return new Path(justRoot, retPath);
+    }
+    return null;
+  }
+
+  /**
+   * this method writes all the valid top level directories
+   * into the srcWriter for indexing. This method is a little
+   * tricky. example-
+   * for an input with parent path /home/user/ and sources
+   * as /home/user/source/dir1, /home/user/source/dir2 - this
+   * will output (dir means that source is a dir
+   * with dir1 and dir2 as children) and
+   * and
+   * @param srcWriter the sequence file writer to write the
+   * directories to
+   * @param paths the source paths provided by the user. They
+   * are glob free and have full path (not relative paths)
+   * @param parentPath the parent path that you wnat the archives
+   * to be relative to. example - /home/user/dir1 can be archived with
+   * parent as /home or /home/user.
+   * @throws IOException
+   */
   private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
-      List<Path> paths) throws IOException {
-    //these are qualified paths
+      List<Path> paths, Path parentPath) throws IOException {
+    //add all the directories
     List<Path> justDirs = new ArrayList<Path>();
     for (Path p: paths) {
       if (!p.getFileSystem(getConf()).isFile(p)) {
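To make the prefix truncation above concrete, here is a rough standalone illustration; the helper below re-implements the same logic as relPathToRoot purely for demonstration, and the class name and paths are placeholders:

    import org.apache.hadoop.fs.Path;

    public class RelPathDemo {
      // same prefix-truncation logic as relPathToRoot in the hunk above
      static Path relPathToRoot(Path fullPath, Path root) {
        Path justRoot = new Path(Path.SEPARATOR);
        if (fullPath.depth() == root.depth()) {
          return justRoot;
        } else if (fullPath.depth() > root.depth()) {
          Path retPath = new Path(fullPath.getName());
          Path parent = fullPath.getParent();
          for (int i = 0; i < (fullPath.depth() - root.depth() - 1); i++) {
            retPath = new Path(parent.getName(), retPath);
            parent = parent.getParent();
          }
          return new Path(justRoot, retPath);
        }
        return null; // root is deeper than fullPath
      }

      public static void main(String[] args) {
        // /home/user/source/dir1 relative to /home/user -> /source/dir1
        System.out.println(relPathToRoot(new Path("/home/user/source/dir1"),
                                         new Path("/home/user")));
        // a path equal to the root maps to /
        System.out.println(relPathToRoot(new Path("/home/user"),
                                         new Path("/home/user")));
      }
    }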
@@ -255,17 +287,23 @@
         justDirs.add(new Path(p.getParent().toUri().getPath()));
       }
     }
-
-    //get the largest depth path
-    // this is tricky
-    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String, HashSet<String>>();
+    /* find all the common parents of paths that are valid archive
+     * paths. The below is done so that we do not add a common path
+     * twice and also we need to only add valid child of a path that
+     * are specified the user.
+     */
+    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String, HashSet<String>>();
+    /* the largest depth of paths. the max number of times
+     * we need to iterate
+     */
     Path deepest = largestDepth(paths);
     Path root = new Path(Path.SEPARATOR);
-    for (int i = 0; i < deepest.depth(); i++) {
+    for (int i = parentPath.depth(); i < deepest.depth(); i++) {
       List<Path> parents = new ArrayList<Path>();
       for (Path p: justDirs) {
         if (p.compareTo(root) == 0){
-          //don nothing
+          //do nothing
         }
         else {
           Path parent = p.getParent();
@@ -285,34 +323,40 @@
     }
     Set<Map.Entry<String, HashSet<String>>> keyVals = allpaths.entrySet();
     for (Map.Entry<String, HashSet<String>> entry : keyVals) {
-      HashSet<String> children = entry.getValue();
-      String toWrite = entry.getKey() + " dir ";
-      StringBuffer sbuff = new StringBuffer();
-      sbuff.append(toWrite);
-      for (String child: children) {
-        sbuff.append(child + " ");
+      Path relPath = relPathToRoot(new Path(entry.getKey()), parentPath);
+      if (relPath != null) {
+        String toWrite = relPath + " dir ";
+        HashSet<String> children = entry.getValue();
+        StringBuffer sbuff = new StringBuffer();
+        sbuff.append(toWrite);
+        for (String child: children) {
+          sbuff.append(child + " ");
+        }
+        toWrite = sbuff.toString();
+        srcWriter.append(new LongWritable(0L), new Text(toWrite));
       }
-      toWrite = sbuff.toString();
-      srcWriter.append(new LongWritable(0L), new Text(toWrite));
     }
   }
 
   /**archive the given source paths into
    * the dest
+   * @param parentPath the parent path of all the source paths
    * @param srcPaths the src paths to be archived
    * @param dest the dest dir that will contain the archive
    */
-  public void archive(List<Path> srcPaths, String archiveName, Path dest)
-  throws IOException {
+  void archive(Path parentPath, List<Path> srcPaths,
+      String archiveName, Path dest) throws IOException {
     checkPaths(conf, srcPaths);
     int numFiles = 0;
     long totalSize = 0;
+    FileSystem fs = parentPath.getFileSystem(conf);
     conf.set(DST_HAR_LABEL, archiveName);
+    conf.set(SRC_PARENT_LABEL, parentPath.makeQualified(fs).toString());
     Path outputPath = new Path(dest, archiveName);
     FileOutputFormat.setOutputPath(conf, outputPath);
     FileSystem outFs = outputPath.getFileSystem(conf);
     if (outFs.exists(outputPath) || outFs.isFile(dest)) {
-      throw new IOException("Invalid Output.");
+      throw new IOException("Invalid Output: " + outputPath);
     }
     conf.set(DST_DIR_LABEL, outputPath.toString());
     final String randomId = DistCp.getRandomId();
@@ -331,7 +375,7 @@
     // create single list of files and dirs
     try {
       // write the top level dirs in first
-      writeTopLevelDirs(srcWriter, srcPaths);
+      writeTopLevelDirs(srcWriter, srcPaths, parentPath);
       srcWriter.sync();
       // these are the input paths passed
       // from the command line
@@ -339,14 +383,13 @@
       // and then write them to the input file
       // one at a time
       for (Path src: srcPaths) {
-        FileSystem fs = src.getFileSystem(conf);
         ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>();
         recursivels(fs, src, allFiles);
         for (FileStatus stat: allFiles) {
           String toWrite = "";
           long len = stat.isDir()? 0:stat.getLen();
           if (stat.isDir()) {
-            toWrite = "" + fs.makeQualified(stat.getPath()) + " dir ";
+            toWrite = "" + relPathToRoot(stat.getPath(), parentPath) + " dir ";
             //get the children
             FileStatus[] list = fs.listStatus(stat.getPath());
             StringBuffer sbuff = new StringBuffer();
@@ -357,7 +400,7 @@
             toWrite = sbuff.toString();
           }
           else {
-            toWrite += fs.makeQualified(stat.getPath()) + " file ";
+            toWrite += relPathToRoot(stat.getPath(), parentPath) + " file ";
           }
           srcWriter.append(new LongWritable(len), new Text(toWrite));
@@ -403,6 +446,7 @@
     Path tmpOutputDir = null;
     Path tmpOutput = null;
     String partname = null;
+    Path rootPath = null;
     FSDataOutputStream partStream = null;
     FileSystem destFs = null;
     byte[] buffer;
@@ -425,6 +469,12 @@
       // directory
       partname = "part-" + partId;
       tmpOutput = new Path(tmpOutputDir, partname);
+      rootPath = (conf.get(SRC_PARENT_LABEL, null) == null) ? null :
+                  new Path(conf.get(SRC_PARENT_LABEL));
+      if (rootPath == null) {
+        throw new RuntimeException("Unable to read parent " +
+            "path for har from config");
+      }
       try {
         destFs = tmpOutput.getFileSystem(conf);
         //this was a stale copy
@@ -450,16 +500,7 @@
         fsin.close();
       }
     }
-
-    // the relative path of p. basically
-    // getting rid of schema. Parsing and doing
-    // string manipulation is not good - so
-    // just use the path api to do it.
-    private Path makeRelative(Path p) {
-      Path retPath = new Path(p.toUri().getPath());
-      return retPath;
-    }
-
+
     static class MapStat {
       private String pathname;
       private boolean isDir;
@@ -481,6 +522,20 @@
         }
       }
     }
+
+    /**
+     * get rid of / in the beginning of path
+     * @param p the path
+     * @return return path without /
+     */
+    private Path realPath(Path p, Path parent) {
+      Path rootPath = new Path(Path.SEPARATOR);
+      if (rootPath.compareTo(p) == 0) {
+        return parent;
+      }
+      return new Path(parent, new Path(p.toString().substring(1)));
+    }
+
     // read files from the split input
     // and write it onto the part files.
     // also output hash(name) and string
@@ -491,10 +546,10 @@
                     Reporter reporter) throws IOException {
       String line = value.toString();
       MapStat mstat = new MapStat(line);
-      Path srcPath = new Path(mstat.pathname);
-      String towrite = null;
-      Path relPath = makeRelative(srcPath);
+      Path relPath = new Path(mstat.pathname);
       int hash = HarFileSystem.getHarHash(relPath);
+      String towrite = null;
+      Path srcPath = realPath(relPath, rootPath);
       long startPos = partStream.getPos();
       if (mstat.isDir) {
         towrite = relPath.toString() + " " + "dir none " + 0 + " " + 0 + " ";
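Going the other way, the map task turns each relative index entry back into a real source path by joining it with the parent path recorded in the job configuration under SRC_PARENT_LABEL. A rough illustration of that step, re-implementing realPath for demonstration only (class name and values are placeholders):

    import org.apache.hadoop.fs.Path;

    public class RealPathDemo {
      // same logic as the private realPath helper added above
      static Path realPath(Path p, Path parent) {
        Path rootPath = new Path(Path.SEPARATOR);
        if (rootPath.compareTo(p) == 0) {
          return parent;          // "/" maps back to the parent itself
        }
        return new Path(parent, new Path(p.toString().substring(1)));
      }

      public static void main(String[] args) {
        Path parent = new Path("/user/hadoop");
        System.out.println(realPath(new Path("/test/a"), parent)); // /user/hadoop/test/a
        System.out.println(realPath(new Path("/"), parent));       // /user/hadoop
      }
    }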
@@ -609,27 +664,26 @@
       outStream.close();
       indexStream.close();
       // try increasing the replication
-      fs.setReplication(index, (short) 10);
-      fs.setReplication(masterIndex, (short) 10);
+      fs.setReplication(index, (short) 5);
+      fs.setReplication(masterIndex, (short) 5);
     }
   }
 
   /** the main driver for creating the archives
-   *  it takes at least two command line parameters. The src and the
-   *  dest. It does an lsr on the source paths.
+   *  it takes at least three command line parameters. The parent path,
+   *  The src and the dest. It does an lsr on the source paths.
    *  The mapper created archuves and the reducer creates
    *  the archive index.
    */
   public int run(String[] args) throws Exception {
     try {
+      Path parentPath = null;
       List<Path> srcPaths = new ArrayList<Path>();
       Path destPath = null;
-      // check we were supposed to archive or
-      // unarchive
       String archiveName = null;
-      if (args.length < 4) {
+      if (args.length < 5) {
         System.out.println(usage);
         throw new IOException("Invalid usage.");
       }
@@ -642,17 +696,34 @@
         System.out.println(usage);
         throw new IOException("Invalid name for archives. " + archiveName);
       }
-      for (int i = 2; i < args.length; i++) {
+      int i = 2;
+      //check to see if relative parent has been provided or not
+      //this is a required parameter.
+      if (! "-p".equals(args[i])) {
+        System.out.println(usage);
+        throw new IOException("Parent path not specified.");
+      }
+      parentPath = new Path(args[i+1]);
+      i+=2;
+      //read the rest of the paths
+      for (; i < args.length; i++) {
         if (i == (args.length - 1)) {
           destPath = new Path(args[i]);
         }
         else {
-          srcPaths.add(new Path(args[i]));
+          Path argPath = new Path(args[i]);
+          if (argPath.isAbsolute()) {
+            System.out.println(usage);
+            throw new IOException("source path " + argPath +
+                " is not relative to "+ parentPath);
+          }
+          srcPaths.add(new Path(parentPath, argPath));
         }
       }
       if (srcPaths.size() == 0) {
-        System.out.println(usage);
-        throw new IOException("Invalid Usage: No input sources specified.");
+        // assuming if the user does not specify path for sources
+        // the whole parent directory needs to be archived.
+        srcPaths.add(parentPath);
       }
       // do a glob on the srcPaths and then pass it on
       List<Path> globPaths = new ArrayList<Path>();
@@ -663,7 +734,7 @@
           globPaths.add(fs.makeQualified(status.getPath()));
         }
       }
-      archive(globPaths, archiveName, destPath);
+      archive(parentPath, globPaths, archiveName, destPath);
     } catch(IOException ie) {
       System.err.println(ie.getLocalizedMessage());
       return -1;