hadoop-mapreduce-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cdoug...@apache.org
Subject svn commit: r794942 - in /hadoop/mapreduce/trunk: CHANGES.txt src/test/mapred/org/apache/hadoop/cli/testMRConf.xml src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java src/tools/org/apache/hadoop/tools/HadoopArchives.java
Date Fri, 17 Jul 2009 02:04:15 GMT
Author: cdouglas
Date: Fri Jul 17 02:04:15 2009
New Revision: 794942

URL: http://svn.apache.org/viewvc?rev=794942&view=rev
Log:
MAPREDUCE-739. Allow relative paths to be created in archives. Contributed by Mahadev Konar

Modified:
    hadoop/mapreduce/trunk/CHANGES.txt
    hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
    hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java

Modified: hadoop/mapreduce/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/CHANGES.txt (original)
+++ hadoop/mapreduce/trunk/CHANGES.txt Fri Jul 17 02:04:15 2009
@@ -114,6 +114,9 @@
     MAPREDUCE-353. Makes the shuffle read and connection timeouts
     configurable. (Ravi Gummadi via ddas)
 
+    MAPREDUCE-739. Allow relative paths to be created in archives. (Mahadev
+    Konar via cdouglas)
+
   BUG FIXES
     MAPREDUCE-703. Sqoop requires dependency on hsqldb in ivy.
     (Aaron Kimball via matei)

Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml (original)
+++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/cli/testMRConf.xml Fri Jul 17
02:04:15 2009
@@ -1,6 +1,21 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <?xml-stylesheet type="text/xsl" href="testConf.xsl"?>
-
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
 <configuration>
   <!-- Normal mode is test. To run just the commands and dump the output
        to the log, set it to nocompare -->
@@ -53,7 +68,7 @@
       <test-commands>
         <command>-fs NAMENODE -touchz /dir0/file0</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName dest/test.har /dir0/ </archive-command>
+        <archive-command>-fs NAMENODE -archiveName dest/test.har -p / dir0/ </archive-command>
       </test-commands>
       <cleanup-commands>
         <command>-fs NAMENODE -rmr /dir0 /dest</command>
@@ -61,11 +76,11 @@
       <comparators>
         <comparator>
           <type>RegexpComparator</type>
-          <expected-output>archive -archiveName NAME &lt;src&gt;\* &lt;dest&gt;</expected-output>
+          <expected-output>archive -archiveName NAME -p &lt;parent path&gt;
&lt;src&gt;\* &lt;dest&gt;</expected-output>
         </comparator>
         <comparator>
           <type>TokenComparator</type>
-          <expected-output>Invalid usage.</expected-output>
+          <expected-output>Invalid name for archives. dest/test.har</expected-output>
         </comparator>
       </comparators>
     </test>
@@ -81,8 +96,8 @@
         <command>-fs NAMENODE -touchz /dir0/dir1/file2</command>
         <command>-fs NAMENODE -touchz /dir0/dir2/file1</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /dest</archive-command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /dest</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /dest</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /dest</archive-command>
       </test-commands>
       <cleanup-commands>
         <command>-fs NAMENODE -rmr /dir0 /dest</command>
@@ -90,7 +105,7 @@
       <comparators>
         <comparator>
           <type>TokenComparator</type>
-          <expected-output>Invalid Output.</expected-output>
+          <expected-output>Invalid Output: /dest/dir0.har</expected-output>
         </comparator>
       </comparators>
     </test>
@@ -105,7 +120,7 @@
         <command>-fs NAMENODE -touchz /dir0/dir1/file1</command>
         <command>-fs NAMENODE -touchz /dir0/dir1/file2</command>
         <command>-fs NAMENODE -touchz /dir0/dir2/file1</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /dir0/</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /dir0/</archive-command>
         <command>-fs NAMENODE -ls /dir0/</command>
       </test-commands>
       <cleanup-commands>
@@ -147,7 +162,7 @@
         <command>-fs NAMENODE -touchz /dir0/dir1/file2</command>
         <command>-fs NAMENODE -touchz /dir0/dir2/file1</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /file1</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /file1</archive-command>
       </test-commands>
       <cleanup-commands>
         <command>-fs NAMENODE -rmr /*</command>
@@ -155,7 +170,7 @@
       <comparators>
         <comparator>
           <type>TokenComparator</type>
-          <expected-output>Invalid Output.</expected-output>
+          <expected-output>Invalid Output: /file1/dir0.har</expected-output>
         </comparator>
       </comparators>
     </test>
@@ -171,7 +186,7 @@
         <command>-fs NAMENODE -touchz /dir0/dir1/file2</command>
         <command>-fs NAMENODE -touchz /dir0/dir2/file1</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /dest</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /dest</archive-command>
         <command>-fs NAMENODE -rm har:///dest/dir0.har/dir0/file0</command>
       </test-commands>
       <cleanup-commands>
@@ -196,7 +211,7 @@
         <command>-fs NAMENODE -touchz /dir0/dir1/file2</command>
         <command>-fs NAMENODE -touchz /dir0/dir2/file1</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /dest</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /dest</archive-command>
         <command>-fs NAMENODE -mv har:///dest/dir0.har/dir0/file0 har:///dest/dir0.har/dir0/file1</command>
       </test-commands>
       <cleanup-commands>
@@ -225,7 +240,7 @@
         <command>-fs NAMENODE -touchz /dir0/dir1/file2</command>
         <command>-fs NAMENODE -touchz /dir0/dir2/file1</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /dest</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /dest</archive-command>
         <command>-fs NAMENODE -count har:///dest/dir0.har/dir0/file0</command>
       </test-commands>
       <cleanup-commands>
@@ -244,7 +259,7 @@
       <test-commands>
         <command>-fs NAMENODE -mkdir /dir0</command>
         <command>-fs NAMENODE -touchz /dir0/file0</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0 </archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p /dir0 </archive-command>
       </test-commands>
       <cleanup-commands>
         <command>-fs NAMENODE -rmr /*</command>
@@ -252,7 +267,7 @@
       <comparators>
         <comparator>
           <type>RegexpComparator</type>
-          <expected-output>archive -archiveName NAME &lt;src&gt;\* &lt;dest&gt;</expected-output>
+          <expected-output>archive -archiveName NAME -p &lt;parent path&gt;
&lt;src&gt;\* &lt;dest&gt;</expected-output>
         </comparator>
         <comparator>
           <type>TokenComparator</type>
@@ -266,7 +281,7 @@
       <test-commands>
         <command>-fs NAMENODE -mkdir /dir0</command>
         <command>-fs NAMENODE -touchz /dir0/file0</command>
-        <archive-command>-fs NAMENODE -archiveName /dir0 /dest </archive-command>
+        <archive-command>-fs NAMENODE -archiveName -p / dir0 /dest </archive-command>
       </test-commands>
       <cleanup-commands>
         <command>-fs NAMENODE -rmr /*</command>
@@ -274,11 +289,11 @@
       <comparators>
         <comparator>
           <type>RegexpComparator</type>
-          <expected-output>archive -archiveName NAME &lt;src&gt;\* &lt;dest&gt;</expected-output>
+          <expected-output>archive -archiveName NAME -p &lt;parent path&gt;
&lt;src&gt;\* &lt;dest&gt;</expected-output>
         </comparator>
         <comparator>
           <type>TokenComparator</type>
-          <expected-output>Invalid usage.</expected-output>
+          <expected-output>archive -archiveName NAME -p &lt;parent path&gt;
&lt;src&gt;* &lt;dest&gt;</expected-output>
         </comparator>
       </comparators>
     </test>
@@ -291,7 +306,7 @@
         <command>-fs NAMENODE -touchz /dir0/file1</command>
         <command>-fs NAMENODE -touchz /dir0/file2</command>
         <command>-fs NAMENODE -mkdir /dir1</command>
-        <archive-command>-fs NAMENODE -archiveName test.har /dir0/file* /dir1</archive-command>
+        <archive-command>-fs NAMENODE -archiveName test.har -p / dir0/file* /dir1</archive-command>
         <command>-fs NAMENODE -ls /dir1</command>
       </test-commands>
       <cleanup-commands>
@@ -318,7 +333,7 @@
         <command>-fs NAMENODE -touchz /dir0/file0</command>
         <command>-fs NAMENODE -touchz /dir1/file1</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName test.har /dir* /dest </archive-command>
+        <archive-command>-fs NAMENODE -archiveName test.har -p / dir* /dest </archive-command>
         <command>-fs NAMENODE -ls /dest</command>
       </test-commands>
       <cleanup-commands>
@@ -341,7 +356,7 @@
       <test-commands>
         <command>-fs NAMENODE -touchz /file0</command>
         <command>-fs NAMENODE -mkdir /dir1</command>
-        <archive-command>-fs NAMENODE -archiveName test.har /file0 /dir1</archive-command>
+        <archive-command>-fs NAMENODE -archiveName test.har -p / file0 /dir1</archive-command>
         <command>-fs NAMENODE -ls /dir1</command>
       </test-commands>
       <cleanup-commands>
@@ -364,7 +379,7 @@
       <test-commands>
         <command>-fs NAMENODE -mkdir /dir0</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName test.har /dir0 /dest </archive-command>
+        <archive-command>-fs NAMENODE -archiveName test.har -p / dir0 /dest </archive-command>
         <command>-fs NAMENODE -ls /dest</command>
       </test-commands>
       <cleanup-commands>
@@ -385,7 +400,7 @@
     <test> <!-- TESTED -->
       <description>Archive: Invalid Source is specified</description>
       <test-commands>
-        <archive-command>-fs NAMENODE -archiveName test.har file0 /dest </archive-command>
+        <archive-command>-fs NAMENODE -archiveName test.har -p file0 /dest </archive-command>
       </test-commands>
       <cleanup-commands>
         <command>-fs NAMENODE -rmr /*</command>
@@ -403,19 +418,19 @@
       <test-commands>
         <command>-fs NAMENODE -mkdir /dir0</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName test /dir0 /dest </archive-command>
+        <archive-command>-fs NAMENODE -archiveName test -p / dir0 /dest </archive-command>
       </test-commands>
       <cleanup-commands>
         <command>-fs NAMENODE -rmr /*</command>
       </cleanup-commands>
       <comparators>
         <comparator>
-          <type>TokenComparator</type>
-          <expected-output>archive -archiveName NAME &lt;src&gt;* &lt;dest&gt;</expected-output>
+          <type>RegexpComparator</type>
+          <expected-output>archive -archiveName NAME -p &lt;parent path&gt;
&lt;src&gt;\* &lt;dest&gt;</expected-output>
         </comparator>
         <comparator>
           <type>TokenComparator</type>
-          <expected-output>Invalid name for archives. test</expected-output>
+          <expected-output>archive -archiveName NAME -p &lt;parent path&gt;
&lt;src&gt;* &lt;dest&gt;</expected-output>
         </comparator>
       </comparators>
     </test>
@@ -425,7 +440,7 @@
       <test-commands>
         <command>-fs NAMENODE -mkdir /dir0</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName test.har /dir0 /dest </archive-command>
+        <archive-command>-fs NAMENODE -archiveName test.har -p / dir0 /dest </archive-command>
         <command>-fs NAMENODE -rmr /dest/test.har</command>
         <command>-fs NAMENODE -ls /dest/</command>
       </test-commands>
@@ -445,7 +460,7 @@
       <test-commands>
         <command>-fs NAMENODE -mkdir /dir0</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName test.har /dir0 /dest </archive-command>
+        <archive-command>-fs NAMENODE -archiveName test.har -p / dir0 /dest </archive-command>
         <command>-fs NAMENODE -mv /dest/test.har /dest/test1.har</command>
         <command>-fs NAMENODE -ls /dest/</command>
       </test-commands>
@@ -475,7 +490,7 @@
         <command>-fs NAMENODE -touchz /dir0/dir1/file2</command>
         <command>-fs NAMENODE -touchz /dir0/dir2/file1</command>
         <command>-fs NAMENODE -mkdir /dest</command>
-        <archive-command>-fs NAMENODE -archiveName dir0.har /dir0/ /dest</archive-command>
+        <archive-command>-fs NAMENODE -archiveName dir0.har -p / dir0/ /dest</archive-command>
         <command>-fs NAMENODE -lsr har:///dest/dir0.har/dir0</command>
       </test-commands>
       <cleanup-commands>

Modified: hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
(original)
+++ hadoop/mapreduce/trunk/src/test/mapred/org/apache/hadoop/tools/TestHarFileSystem.java
Fri Jul 17 02:04:15 2009
@@ -19,6 +19,7 @@
 package org.apache.hadoop.tools;
 
 import java.io.IOException;
+import java.net.URI;
 import java.util.Iterator;
 
 import junit.framework.TestCase;
@@ -44,7 +45,7 @@
  * and then run a map reduce job
  */
 public class TestHarFileSystem extends TestCase {
-  private Path inputPath;
+  private Path inputPath, inputrelPath;
   private MiniDFSCluster dfscluster;
   private MiniMRCluster mapred;
   private FileSystem fs;
@@ -53,14 +54,26 @@
   
   protected void setUp() throws Exception {
     super.setUp();
-    dfscluster = new MiniDFSCluster(new JobConf(), 2, true, null);
+    dfscluster = new MiniDFSCluster(new Configuration(), 2, true, null);
     fs = dfscluster.getFileSystem();
     mapred = new MiniMRCluster(2, fs.getUri().toString(), 1);
     inputPath = new Path(fs.getHomeDirectory(), "test"); 
+    inputrelPath = new Path(fs.getHomeDirectory().toUri().
+        getPath().substring(1), "test");
     filea = new Path(inputPath,"a");
     fileb = new Path(inputPath,"b");
     filec = new Path(inputPath,"c");
     archivePath = new Path(fs.getHomeDirectory(), "tmp");
+    fs.mkdirs(inputPath);
+    FSDataOutputStream out = fs.create(filea); 
+    out.write("a".getBytes());
+    out.close();
+    out = fs.create(fileb);
+    out.write("b".getBytes());
+    out.close();
+    out = fs.create(filec);
+    out.write("c".getBytes());
+    out.close();
   }
   
   protected void tearDown() throws Exception {
@@ -100,45 +113,90 @@
     }
   }
   
-  public void testArchives() throws Exception {
-    fs.mkdirs(inputPath);
-    
-    FSDataOutputStream out = fs.create(filea); 
-    out.write("a".getBytes());
-    out.close();
-    out = fs.create(fileb);
-    out.write("b".getBytes());
-    out.close();
-    out = fs.create(filec);
-    out.write("c".getBytes());
-    out.close();
+  // test archives with a -p option
+  public void testRelativeArchives() throws Exception {
+    fs.delete(archivePath,true);
     Configuration conf = mapred.createJobConf();
     HadoopArchives har = new HadoopArchives(conf);
-    String[] args = new String[3];
+    String[] args = new String[6];
+    args[0] = "-archiveName";
+    args[1] = "foo.har";
+    args[2] = "-p";
+    args[3] =  fs.getHomeDirectory().toString();
+    args[4] = "test";
+    args[5] = archivePath.toString();
+    int ret = ToolRunner.run(har, args);
+    assertTrue("failed test", ret == 0);
+    Path finalPath = new Path(archivePath, "foo.har");
+    Path fsPath = new Path(inputPath.toUri().getPath());
+    Path filePath = new Path(finalPath, "test");
+    //make it a har path 
+    Path harPath = new Path("har://" + filePath.toUri().getPath());
+    assertTrue(fs.exists(new Path(finalPath, "_index")));
+    assertTrue(fs.exists(new Path(finalPath, "_masterindex")));
+    assertTrue(!fs.exists(new Path(finalPath, "_logs")));
+    args = new String[2];
+    args[0] = "-ls";
+    args[1] = harPath.toString();
+    FsShell shell = new FsShell(conf);
+    ret = ToolRunner.run(shell, args);
+    // fileb and filec
+    assertTrue(ret == 0);
+    Path harFilea = new Path(harPath, "a");
+    Path harFileb = new Path(harPath, "b");
+    Path harFilec = new Path(harPath, "c");
+    FileSystem harFs = harFilea.getFileSystem(conf);
+    FSDataInputStream fin = harFs.open(harFilea);
+    byte[] b = new byte[4];
+    int readBytes = fin.read(b);
+    fin.close();
+    assertTrue("strings are equal ", (b[0] == "a".getBytes()[0]));
+    fin = harFs.open(harFileb);
+    fin.read(b);
+    fin.close();
+    assertTrue("strings are equal ", (b[0] == "b".getBytes()[0]));
+    fin = harFs.open(harFilec);
+    fin.read(b);
+    fin.close();
+    assertTrue("strings are equal ", (b[0] == "c".getBytes()[0]));
+  }
+  
+ 
+  public void testArchivesWithMapred() throws Exception {
+    fs.delete(archivePath, true);
+    Configuration conf = mapred.createJobConf();
+    HadoopArchives har = new HadoopArchives(conf);
+    String[] args = new String[4];
+ 
    //check for destination not specified
     args[0] = "-archiveName";
     args[1] = "foo.har";
-    args[2] = inputPath.toString();
+    args[2] = "-p";
+    args[3] = "/";
     int ret = ToolRunner.run(har, args);
     assertTrue(ret != 0);
-    args = new String[4];
+    args = new String[6];
     //check for wrong archiveName
     args[0] = "-archiveName";
     args[1] = "/d/foo.har";
-    args[2] = inputPath.toString();
-    args[3] = archivePath.toString();
+    args[2] = "-p";
+    args[3] = "/";
+    args[4] = inputrelPath.toString();
+    args[5] = archivePath.toString();
     ret = ToolRunner.run(har, args);
     assertTrue(ret != 0);
-//  se if dest is a file 
+    //  see if dest is a file 
     args[1] = "foo.har";
-    args[3] = filec.toString();
+    args[5] = filec.toString();
     ret = ToolRunner.run(har, args);
     assertTrue(ret != 0);
     //this is a valid run
     args[0] = "-archiveName";
     args[1] = "foo.har";
-    args[2] = inputPath.toString();
-    args[3] = archivePath.toString();
+    args[2] = "-p";
+    args[3] = "/";
+    args[4] = inputrelPath.toString();
+    args[5] = archivePath.toString();
     ret = ToolRunner.run(har, args);
    //check for the existence of the archive
     assertTrue(ret == 0);
@@ -151,13 +209,16 @@
     String relative = fsPath.toString().substring(1);
     Path filePath = new Path(finalPath, relative);
     //make it a har path 
-    Path harPath = new Path("har://" + filePath.toUri().getPath());
+    URI uri = fs.getUri();
+    Path harPath = new Path("har://" + "hdfs-" + uri.getHost() +":" +
+        uri.getPort() + filePath.toUri().getPath());
     assertTrue(fs.exists(new Path(finalPath, "_index")));
     assertTrue(fs.exists(new Path(finalPath, "_masterindex")));
     assertTrue(!fs.exists(new Path(finalPath, "_logs")));
     //creation tested
     //check if the archive is same
     // do ls and cat on all the files
+    
     FsShell shell = new FsShell(conf);
     args = new String[2];
     args[0] = "-ls";

Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java?rev=794942&r1=794941&r2=794942&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/HadoopArchives.java Fri Jul 17
02:04:15 2009
@@ -59,6 +59,8 @@
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 
+import com.sun.corba.se.spi.ior.MakeImmutable;
+
 
 /**
  * a archive creation utility.
@@ -77,12 +79,13 @@
   static final String SRC_COUNT_LABEL = NAME + ".src.count";
   static final String TOTAL_SIZE_LABEL = NAME + ".total.size";
   static final String DST_HAR_LABEL = NAME + ".archive.name";
+  static final String SRC_PARENT_LABEL = NAME + ".parent.path";
   // size of each part file
   // its fixed for now.
   static final long partSize = 2 * 1024 * 1024 * 1024l;
 
   private static final String usage = "archive"
-  + " -archiveName NAME <src>* <dest>" +
+  + " -archiveName NAME -p <parent path> <src>* <dest>" +
   "\n";
   
  
@@ -228,24 +231,53 @@
     return deepest;
   }
   
-  // this method is tricky. This method writes 
-  // the top level directories in such a way so that 
-  // the output only contains valid directoreis in archives.
-  // so for an input path specified by the user 
-  // as /user/hadoop
-  // we need to index 
-  // / as the root 
-  // /user as a directory
-  // /user/hadoop as a directory
-  // so for multiple input paths it makes sure that it
-  // does the right thing.
-  // so if the user specifies the input directories as 
-  // /user/harry and /user/hadoop
-  // we need to write / and user as its child
-  // and /user and harry and hadoop as its children
+  /**
+   * truncate the prefix root from the full path
+   * @param fullPath the full path
+   * @param root the prefix root to be truncated
+   * @return the relative path
+   */
+  private Path relPathToRoot(Path fullPath, Path root) {
+    // just take some effort to do it 
+    // rather than just using substring 
+    // so that we do not break sometime later
+    Path justRoot = new Path(Path.SEPARATOR);
+    if (fullPath.depth() == root.depth()) {
+      return justRoot;
+    }
+    else if (fullPath.depth() > root.depth()) {
+      Path retPath = new Path(fullPath.getName());
+      Path parent = fullPath.getParent();
+      for (int i=0; i < (fullPath.depth() - root.depth() -1); i++) {
+        retPath = new Path(parent.getName(), retPath);
+        parent = parent.getParent();
+      }
+      return new Path(justRoot, retPath);
+    }
+    return null;
+  }
+
+  /**
+   * this method writes all the valid top level directories 
+   * into the srcWriter for indexing. This method is a little
+   * tricky. example- 
+   * for an input with parent path /home/user/ and sources 
+   * as /home/user/source/dir1, /home/user/source/dir2 - this 
+   * will output <source, dir, dir1, dir2> (dir means that source is a dir
+   * with dir1 and dir2 as children) and <source/dir1, file, null>
+   * and <source/dir2, file, null>
+   * @param srcWriter the sequence file writer to write the
+   * directories to
+   * @param paths the source paths provided by the user. They
+   * are glob free and have full path (not relative paths)
+   * @param parentPath the parent path that you want the archives
+   * to be relative to. example - /home/user/dir1 can be archived with
+   * parent as /home or /home/user.
+   * @throws IOException
+   */
   private void writeTopLevelDirs(SequenceFile.Writer srcWriter, 
-      List<Path> paths) throws IOException {
-    //these are qualified paths 
+      List<Path> paths, Path parentPath) throws IOException {
+    //add all the directories 
     List<Path> justDirs = new ArrayList<Path>();
     for (Path p: paths) {
       if (!p.getFileSystem(getConf()).isFile(p)) {
@@ -255,17 +287,23 @@
         justDirs.add(new Path(p.getParent().toUri().getPath()));
       }
     }
-    
-    //get the largest depth path
-    // this is tricky
-    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String, HashSet<String>>();
+    /* find all the common parents of paths that are valid archive
+     * paths. The below is done so that we do not add a common path
+     * twice and also we need to only add valid child of a path that
+     * are specified the user.
+     */
+    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String, 
+                                                HashSet<String>>();
+    /* the largest depth of paths. the max number of times
+     * we need to iterate
+     */
     Path deepest = largestDepth(paths);
     Path root = new Path(Path.SEPARATOR);
-    for (int i = 0; i < deepest.depth(); i++) {
+    for (int i = parentPath.depth(); i < deepest.depth(); i++) {
       List<Path> parents = new ArrayList<Path>();
       for (Path p: justDirs) {
         if (p.compareTo(root) == 0){
-          //don nothing
+          //do nothing
         }
         else {
           Path parent = p.getParent();
@@ -285,34 +323,40 @@
     }
     Set<Map.Entry<String, HashSet<String>>> keyVals = allpaths.entrySet();
     for (Map.Entry<String, HashSet<String>> entry : keyVals) {
-      HashSet<String> children = entry.getValue();
-      String toWrite = entry.getKey() + " dir ";
-      StringBuffer sbuff = new StringBuffer();
-      sbuff.append(toWrite);
-      for (String child: children) {
-        sbuff.append(child + " ");
+      Path relPath = relPathToRoot(new Path(entry.getKey()), parentPath);
+      if (relPath != null) {
+        String toWrite = relPath + " dir ";
+        HashSet<String> children = entry.getValue();
+        StringBuffer sbuff = new StringBuffer();
+        sbuff.append(toWrite);
+        for (String child: children) {
+          sbuff.append(child + " ");
+        }
+        toWrite = sbuff.toString();
+        srcWriter.append(new LongWritable(0L), new Text(toWrite));
       }
-      toWrite = sbuff.toString();
-      srcWriter.append(new LongWritable(0L), new Text(toWrite));
     }
   }
   
   /**archive the given source paths into
    * the dest
+   * @param parentPath the parent path of all the source paths
    * @param srcPaths the src paths to be archived
    * @param dest the dest dir that will contain the archive
    */
-  public void archive(List<Path> srcPaths, String archiveName, Path dest) 
-  throws IOException {
+  void archive(Path parentPath, List<Path> srcPaths, 
+      String archiveName, Path dest) throws IOException {
     checkPaths(conf, srcPaths);
     int numFiles = 0;
     long totalSize = 0;
+    FileSystem fs = parentPath.getFileSystem(conf);
     conf.set(DST_HAR_LABEL, archiveName);
+    conf.set(SRC_PARENT_LABEL, parentPath.makeQualified(fs).toString());
     Path outputPath = new Path(dest, archiveName);
     FileOutputFormat.setOutputPath(conf, outputPath);
     FileSystem outFs = outputPath.getFileSystem(conf);
     if (outFs.exists(outputPath) || outFs.isFile(dest)) {
-      throw new IOException("Invalid Output.");
+      throw new IOException("Invalid Output: " + outputPath);
     }
     conf.set(DST_DIR_LABEL, outputPath.toString());
     final String randomId = DistCp.getRandomId();
@@ -331,7 +375,7 @@
     // create single list of files and dirs
     try {
       // write the top level dirs in first 
-      writeTopLevelDirs(srcWriter, srcPaths);
+      writeTopLevelDirs(srcWriter, srcPaths, parentPath);
       srcWriter.sync();
       // these are the input paths passed 
       // from the command line
@@ -339,14 +383,13 @@
       // and then write them to the input file 
       // one at a time
       for (Path src: srcPaths) {
-        FileSystem fs = src.getFileSystem(conf);
         ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>();
         recursivels(fs, src, allFiles);
         for (FileStatus stat: allFiles) {
           String toWrite = "";
           long len = stat.isDir()? 0:stat.getLen();
           if (stat.isDir()) {
-            toWrite = "" + fs.makeQualified(stat.getPath()) + " dir ";
+            toWrite = "" + relPathToRoot(stat.getPath(), parentPath) + " dir ";
             //get the children 
             FileStatus[] list = fs.listStatus(stat.getPath());
             StringBuffer sbuff = new StringBuffer();
@@ -357,7 +400,7 @@
             toWrite = sbuff.toString();
           }
           else {
-            toWrite +=  fs.makeQualified(stat.getPath()) + " file ";
+            toWrite +=  relPathToRoot(stat.getPath(), parentPath) + " file ";
           }
           srcWriter.append(new LongWritable(len), new 
               Text(toWrite));
@@ -403,6 +446,7 @@
     Path tmpOutputDir = null;
     Path tmpOutput = null;
     String partname = null;
+    Path rootPath = null;
     FSDataOutputStream partStream = null;
     FileSystem destFs = null;
     byte[] buffer;
@@ -425,6 +469,12 @@
       // directory 
       partname = "part-" + partId;
       tmpOutput = new Path(tmpOutputDir, partname);
+      rootPath = (conf.get(SRC_PARENT_LABEL, null) == null) ? null :
+                  new Path(conf.get(SRC_PARENT_LABEL));
+      if (rootPath == null) {
+        throw new RuntimeException("Unable to read parent " +
+        		"path for har from config");
+      }
       try {
         destFs = tmpOutput.getFileSystem(conf);
         //this was a stale copy
@@ -450,16 +500,7 @@
         fsin.close();
       }
     }
-    
-    // the relative path of p. basically 
-    // getting rid of schema. Parsing and doing 
-    // string manipulation is not good - so
-    // just use the path api to do it.
-    private Path makeRelative(Path p) {
-      Path retPath = new Path(p.toUri().getPath());
-      return retPath;
-    }
-    
+       
     static class MapStat {
       private String pathname;
       private boolean isDir;
@@ -481,6 +522,20 @@
         }
       }
     }
+    
+    /**
+     * get rid of / in the beginning of path
+     * @param p the path
+     * @return return path without /
+     */
+    private Path realPath(Path p, Path parent) {
+      Path rootPath = new Path(Path.SEPARATOR);
+      if (rootPath.compareTo(p) == 0) {
+        return parent;
+      }
+      return new Path(parent, new Path(p.toString().substring(1)));
+    }
+
     // read files from the split input 
     // and write it onto the part files.
     // also output hash(name) and string 
@@ -491,10 +546,10 @@
         Reporter reporter) throws IOException {
       String line  = value.toString();
       MapStat mstat = new MapStat(line);
-      Path srcPath = new Path(mstat.pathname);
-      String towrite = null;
-      Path relPath = makeRelative(srcPath);
+      Path relPath = new Path(mstat.pathname);
       int hash = HarFileSystem.getHarHash(relPath);
+      String towrite = null;
+      Path srcPath = realPath(relPath, rootPath);
       long startPos = partStream.getPos();
       if (mstat.isDir) { 
         towrite = relPath.toString() + " " + "dir none " + 0 + " " + 0 + " ";
@@ -609,27 +664,26 @@
       outStream.close();
       indexStream.close();
       // try increasing the replication 
-      fs.setReplication(index, (short) 10);
-      fs.setReplication(masterIndex, (short) 10);
+      fs.setReplication(index, (short) 5);
+      fs.setReplication(masterIndex, (short) 5);
     }
     
   }
   
   /** the main driver for creating the archives
-   *  it takes at least two command line parameters. The src and the 
-   *  dest. It does an lsr on the source paths.
+   *  it takes at least three command line parameters. The parent path, 
+   *  The src and the dest. It does an lsr on the source paths.
   *  The mapper creates archives and the reducer creates 
    *  the archive index.
    */
 
   public int run(String[] args) throws Exception {
     try {
+      Path parentPath = null;
       List<Path> srcPaths = new ArrayList<Path>();
       Path destPath = null;
-      // check we were supposed to archive or 
-      // unarchive
       String archiveName = null;
-      if (args.length < 4) {
+      if (args.length < 5) {
         System.out.println(usage);
         throw new IOException("Invalid usage.");
       }
@@ -642,17 +696,34 @@
         System.out.println(usage);
         throw new IOException("Invalid name for archives. " + archiveName);
       }
-      for (int i = 2; i < args.length; i++) {
+      int i = 2;
+      //check to see if relative parent has been provided or not
+      //this is a required parameter. 
+      if (! "-p".equals(args[i])) {
+        System.out.println(usage);
+        throw new IOException("Parent path not specified.");
+      }
+      parentPath = new Path(args[i+1]);
+      i+=2;
+      //read the rest of the paths
+      for (; i < args.length; i++) {
         if (i == (args.length - 1)) {
           destPath = new Path(args[i]);
         }
         else {
-          srcPaths.add(new Path(args[i]));
+          Path argPath = new Path(args[i]);
+          if (argPath.isAbsolute()) {
+            System.out.println(usage);
+            throw new IOException("source path " + argPath +
+                " is not relative  to "+ parentPath);
+          }
+          srcPaths.add(new Path(parentPath, argPath));
         }
       }
       if (srcPaths.size() == 0) {
-        System.out.println(usage);
-        throw new IOException("Invalid Usage: No input sources specified.");
+        // assuming if the user does not specify path for sources
+        // the whole parent directory needs to be archived. 
+        srcPaths.add(parentPath);
       }
       // do a glob on the srcPaths and then pass it on
       List<Path> globPaths = new ArrayList<Path>();
@@ -663,7 +734,7 @@
           globPaths.add(fs.makeQualified(status.getPath()));
         }
       }
-      archive(globPaths, archiveName, destPath);
+      archive(parentPath, globPaths, archiveName, destPath);
     } catch(IOException ie) {
       System.err.println(ie.getLocalizedMessage());
       return -1;



Mime
View raw message