Author: mahadev Date: Tue Apr 6 19:07:33 2010 New Revision: 931274 URL: http://svn.apache.org/viewvc?rev=931274&view=rev Log: MAPREDUCE-1585. Create Hadoop Archives version 2 with filenames URL-encoded (rodrigo via mahadev) Modified: hadoop/mapreduce/trunk/CHANGES.txt hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHadoopArchives.java hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java Modified: hadoop/mapreduce/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=931274&r1=931273&r2=931274&view=diff ============================================================================== --- hadoop/mapreduce/trunk/CHANGES.txt (original) +++ hadoop/mapreduce/trunk/CHANGES.txt Tue Apr 6 19:07:33 2010 @@ -511,6 +511,9 @@ Trunk (unreleased changes) MAPREDUCE-1602. Fix the error message for the case that src does not exist. (szetszwo) + MAPREDUCE-1585. Create Hadoop Archives version 2 with filenames + URL-encoded (rodrigo via mahadev) + Release 0.21.0 - Unreleased INCOMPATIBLE CHANGES Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHadoopArchives.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHadoopArchives.java?rev=931274&r1=931273&r2=931274&view=diff ============================================================================== --- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHadoopArchives.java (original) +++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHadoopArchives.java Tue Apr 6 19:07:33 2010 @@ -112,8 +112,9 @@ public class TestHadoopArchives extends createFile(sub1, "x", fs); createFile(sub1, "y", fs); createFile(sub1, "z", fs); - final Path sub2 = new Path(inputPath, "sub 2"); + final Path sub2 = new Path(inputPath, "sub 1 with suffix"); fs.mkdirs(sub2); + createFile(sub2, "z", fs); final Configuration conf = mapred.createJobConf(); final FsShell shell = new FsShell(conf); @@ -125,23 +126,9 @@ public class TestHadoopArchives extends final String prefix = "har://hdfs-" + uri.getHost() +":" + uri.getPort() + archivePath.toUri().getPath() + Path.SEPARATOR; - {//space replacement is not enabled - final String[] args = { - "-archiveName", - "fail.har", - "-p", - inputPathStr, - "*", - archivePath.toString() - }; - final HadoopArchives har = new HadoopArchives(mapred.createJobConf()); - assertEquals(-1, ToolRunner.run(har, args)); - } - {//Enable space replacement final String harName = "foo.har"; final String[] args = { - "-D" + HadoopArchives.SPACE_REPLACE_LABEL + "=true", "-archiveName", harName, "-p", @@ -154,87 +141,11 @@ public class TestHadoopArchives extends //compare results final List harPaths = lsr(shell, prefix + harName); - final List replaced = replace(originalPaths, HadoopArchives.SPACE_REPLACEMENT_DEFAULT); - assertEquals(replaced, harPaths); - } - - {//Replace space with space - final String[] args = { - "-D" + HadoopArchives.SPACE_REPLACE_LABEL + "=true", - "-D" + HadoopArchives.SPACE_REPLACEMENT_LABEL + "=p q", - "-Dhar.space.replace.enable=true", - "-archiveName", - "fail.har", - "-p", - inputPathStr, - "*", - archivePath.toString() - }; - final HadoopArchives har = new HadoopArchives(mapred.createJobConf()); - try { - ToolRunner.run(har, args); - fail(); - } catch(IllegalArgumentException iae) { - System.out.println("GOOD"); - iae.printStackTrace(); - } - } - - {//Replace space with Path.SEPARATOR - final String[] args = { - "-D" + HadoopArchives.SPACE_REPLACE_LABEL + "=true", - "-D" + HadoopArchives.SPACE_REPLACEMENT_LABEL + "=" + Path.SEPARATOR, - "-Dhar.space.replace.enable=true", - "-archiveName", - "fail.har", - "-p", - inputPathStr, - "*", - archivePath.toString() - }; - final HadoopArchives har = new HadoopArchives(mapred.createJobConf()); - try { - ToolRunner.run(har, args); - fail(); - } catch(IllegalArgumentException iae) { - System.out.println("GOOD"); - iae.printStackTrace(); - } + assertEquals(originalPaths, harPaths); } - {//Replace space with a valid replacement - final String harName = "bar.har"; - final String replacement = "+-"; - final String[] args = { - "-D" + HadoopArchives.SPACE_REPLACE_LABEL + "=true", - "-D" + HadoopArchives.SPACE_REPLACEMENT_LABEL + "=" + replacement, - "-Dhar.space.replace.enable=true", - "-archiveName", - harName, - "-p", - inputPathStr, - "*", - archivePath.toString() - }; - final HadoopArchives har = new HadoopArchives(mapred.createJobConf()); - assertEquals(0, ToolRunner.run(har, args)); - - //compare results - final List harPaths = lsr(shell, prefix + harName); - final List replaced = replace(originalPaths, replacement); - assertEquals(replaced, harPaths); - } } - private static List replace(List paths, String replacement) { - final List replaced = new ArrayList(); - for(int i = 0; i < paths.size(); i++) { - replaced.add(paths.get(i).replace(" ", replacement)); - } - Collections.sort(replaced); - return replaced; - } - private static List lsr(final FsShell shell, String dir ) throws Exception { System.out.println("lsr root=" + dir); Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java?rev=931274&r1=931273&r2=931274&view=diff ============================================================================== --- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java (original) +++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java Tue Apr 6 19:07:33 2010 @@ -37,6 +37,7 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*; import org.apache.hadoop.tools.HadoopArchives; import org.apache.hadoop.util.ToolRunner; +import org.mortbay.log.Log; /** * test the har file system @@ -62,7 +63,7 @@ public class TestHarFileSystem extends T getPath().substring(1), "test"); filea = new Path(inputPath,"a"); fileb = new Path(inputPath,"b"); - filec = new Path(inputPath,"c"); + filec = new Path(inputPath,"c c"); archivePath = new Path(fs.getHomeDirectory(), "tmp"); fs.mkdirs(inputPath); FSDataOutputStream out = fs.create(filea); @@ -117,7 +118,7 @@ public class TestHarFileSystem extends T private void checkBytes(Path harPath, Configuration conf) throws IOException { Path harFilea = new Path(harPath, "a"); Path harFileb = new Path(harPath, "b"); - Path harFilec = new Path(harPath, "c"); + Path harFilec = new Path(harPath, "c c"); FileSystem harFs = harFilea.getFileSystem(conf); FSDataInputStream fin = harFs.open(harFilea); byte[] b = new byte[4]; @@ -291,7 +292,7 @@ public class TestHarFileSystem extends T // fileb and filec Path harFilea = new Path(harPath, "a"); Path harFileb = new Path(harPath, "b"); - Path harFilec = new Path(harPath, "c"); + Path harFilec = new Path(harPath, "c c"); FileSystem harFs = harFilea.getFileSystem(conf); FSDataInputStream fin = harFs.open(harFilea); byte[] b = new byte[4]; @@ -311,6 +312,8 @@ public class TestHarFileSystem extends T assertTrue("strings are equal ", (b[0] == "c".getBytes()[0])); // ok all files match // run a map reduce job + FileSystem fsHar = harPath.getFileSystem(conf); + FileStatus[] bla = fsHar.listStatus(harPath); Path outdir = new Path(fs.getHomeDirectory(), "mapout"); JobConf jobconf = mapred.createJobConf(); FileInputFormat.addInputPath(jobconf, harPath); @@ -331,7 +334,7 @@ public class TestHarFileSystem extends T FSDataInputStream reduceIn = fs.open(reduceFile); b = new byte[6]; readBytes = reduceIn.read(b); - assertTrue("Should read 6 bytes.", readBytes == 6); + assertTrue("Should read 6 bytes instead of "+readBytes+".", readBytes == 6); //assuming all the 6 bytes were read. Text readTxt = new Text(b); assertTrue("a\nb\nc\n".equals(readTxt.toString())); Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java?rev=931274&r1=931273&r2=931274&view=diff ============================================================================== --- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java (original) +++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java Tue Apr 6 19:07:33 2010 @@ -22,6 +22,9 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.net.URLEncoder; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; @@ -74,7 +77,7 @@ import org.apache.hadoop.util.ToolRunner * Hadoop archives look at {@link HarFileSystem}. */ public class HadoopArchives implements Tool { - public static final int VERSION = 1; + public static final int VERSION = 2; private static final Log LOG = LogFactory.getLog(HadoopArchives.class); private static final String NAME = "har"; @@ -90,22 +93,6 @@ public class HadoopArchives implements T static final String HAR_BLOCKSIZE_LABEL = NAME + ".block.size"; /**the size of the part files that will be created when archiving **/ static final String HAR_PARTSIZE_LABEL = NAME + ".partfile.size"; - static final String SPACE_REPLACE_LABEL = NAME + ".space.replace.enable"; - static final boolean SPACE_REPLACE_DEFAULT = false; - static final String SPACE_REPLACEMENT_LABEL = NAME + ".space.replacement"; - static final String SPACE_REPLACEMENT_DEFAULT = "_"; - static final String SPACE_REPLACEMENT_DESCRIPTION - = HadoopArchives.class.getSimpleName() + " (version=" + VERSION - + ") does not support source paths with the space character." - + "\n\nThere is a space replacement option, which can be enabled by" - + "\n -D" + SPACE_REPLACE_LABEL + "=true" - + "\nThe space replacement string can be specified by" - + "\n -D" + SPACE_REPLACEMENT_LABEL + "=" - + "\nThe default is \"" - + SPACE_REPLACEMENT_DEFAULT - + "\".\n*** Note that the original paths will not be changed" - + " by the space replacement option." - + " The resulted har contains only the replaced paths."; /** size of each part file size **/ long partSize = 2 * 1024 * 1024 * 1024l; @@ -118,11 +105,6 @@ public class HadoopArchives implements T private JobConf conf; - private String spaceReplacement = null; - - private boolean isSpaceReplaceEnabled() { - return spaceReplacement != null; - } public void setConf(Configuration conf) { if (conf instanceof JobConf) { @@ -315,13 +297,12 @@ public class HadoopArchives implements T * @param fullPath the full path * @param root the prefix root to be truncated * @return the relative path - * @throws IOException */ - private String relPathToRoot(Path fullPath, Path root) throws IOException { + private Path relPathToRoot(Path fullPath, Path root) { // just take some effort to do it // rather than just using substring // so that we do not break sometime later - final String justRoot = Path.SEPARATOR; + final Path justRoot = new Path(Path.SEPARATOR); if (fullPath.depth() == root.depth()) { return justRoot; } @@ -332,7 +313,7 @@ public class HadoopArchives implements T retPath = new Path(parent.getName(), retPath); parent = parent.getParent(); } - return new Path(justRoot, retPath.toString()).toString(); + return new Path(justRoot, retPath); } return null; } @@ -403,35 +384,20 @@ public class HadoopArchives implements T } Set>> keyVals = allpaths.entrySet(); for (Map.Entry> entry : keyVals) { - final String relPath = relPathToRoot(new Path(entry.getKey()), parentPath); + final Path relPath = relPathToRoot(new Path(entry.getKey()), parentPath); if (relPath != null) { final String[] children = new String[entry.getValue().size()]; int i = 0; for(String child: entry.getValue()) { children[i++] = child; } - append(srcWriter, 0L, relPath, children); + append(srcWriter, 0L, relPath.toString(), children); } } } - //check whether the path contains the space character. - private void checkSpace(String p) throws IOException { - //check only if space replacement is disabled. - if (!isSpaceReplaceEnabled() && p.indexOf(' ') >= 0) { - throw new IOException("Source \"" + p + "\" contains the space character. " - + SPACE_REPLACEMENT_DESCRIPTION); - } - } - private void append(SequenceFile.Writer srcWriter, long len, String path, String[] children) throws IOException { - checkSpace(path); - if (children != null) { - for(String child: children) { - checkSpace(child); - } - } srcWriter.append(new LongWritable(len), new HarEntry(path, children)); } @@ -544,7 +510,7 @@ public class HadoopArchives implements T for (FileStatusDir statDir: allFiles) { FileStatus stat = statDir.getFileStatus(); long len = stat.isDir()? 0:stat.getLen(); - final String path = relPathToRoot(stat.getPath(), parentPath); + final Path path = relPathToRoot(stat.getPath(), parentPath); final String[] children; if (stat.isDir()) { //get the children @@ -557,7 +523,7 @@ public class HadoopArchives implements T else { children = null; } - append(srcWriter, len, path, children); + append(srcWriter, len, path.toString(), children); srcWriter.sync(); numFiles++; totalSize += len; @@ -596,8 +562,6 @@ public class HadoopArchives implements T static class HArchivesMapper implements Mapper { private JobConf conf = null; - private String spaceReplacement; - int partId = -1 ; Path tmpOutputDir = null; Path tmpOutput = null; @@ -615,8 +579,6 @@ public class HadoopArchives implements T // tmp files. public void configure(JobConf conf) { this.conf = conf; - this.spaceReplacement = conf.get(SPACE_REPLACEMENT_LABEL, - SPACE_REPLACEMENT_DEFAULT); // this is tightly tied to map reduce // since it does not expose an api @@ -676,8 +638,9 @@ public class HadoopArchives implements T return new Path(parent, new Path(p.toString().substring(1))); } - private String replaceSpaces(String s) { - return s.replace(" ", spaceReplacement); + private static String encodeName(String s) + throws UnsupportedEncodingException { + return URLEncoder.encode(s,"UTF-8"); } // read files from the split input @@ -690,15 +653,15 @@ public class HadoopArchives implements T Reporter reporter) throws IOException { Path relPath = new Path(value.path); int hash = HarFileSystem.getHarHash(relPath); - String towrite = replaceSpaces(relPath.toString()); + String towrite = encodeName(relPath.toString()); Path srcPath = realPath(relPath, rootPath); long startPos = partStream.getPos(); if (value.isDir()) { - towrite += " dir none " + 0 + " " + 0 + " "; + towrite += " dir none 0 0 "; StringBuffer sbuff = new StringBuffer(); sbuff.append(towrite); for (String child: value.children) { - sbuff.append(replaceSpaces(child) + " "); + sbuff.append(encodeName(child) + " "); } towrite = sbuff.toString(); //reading directories is also progress @@ -884,22 +847,6 @@ public class HadoopArchives implements T + srcPaths); } - //process space replacement configuration - if (conf.getBoolean(SPACE_REPLACE_LABEL, SPACE_REPLACE_DEFAULT)) { - spaceReplacement = conf.get(SPACE_REPLACEMENT_LABEL, - SPACE_REPLACEMENT_DEFAULT); - if (spaceReplacement.indexOf(' ') >= 0) { - throw new IllegalArgumentException("spaceReplacement = \"" - + spaceReplacement + "\" cannot contain the space character."); - } - if (spaceReplacement.indexOf(Path.SEPARATOR) >= 0) { - throw new IllegalArgumentException("spaceReplacement = \"" - + spaceReplacement + "\" cannot contain the path separator \"" - + Path.SEPARATOR + "\"."); - } - LOG.info(SPACE_REPLACEMENT_LABEL + " = " + spaceReplacement); - } - archive(parentPath, globPaths, archiveName, destPath); } catch(IOException ie) { System.err.println(ie.getLocalizedMessage());