hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From szets...@apache.org
Subject svn commit: r915168 - in /hadoop/common/trunk: CHANGES.txt src/java/org/apache/hadoop/fs/HarFileSystem.java
Date Tue, 23 Feb 2010 03:54:14 GMT
Author: szetszwo
Date: Tue Feb 23 03:54:14 2010
New Revision: 915168

URL: http://svn.apache.org/viewvc?rev=915168&view=rev
Log:
HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..).  Contributed by mahadev

Modified:
    hadoop/common/trunk/CHANGES.txt
    hadoop/common/trunk/src/java/org/apache/hadoop/fs/HarFileSystem.java

Modified: hadoop/common/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/CHANGES.txt?rev=915168&r1=915167&r2=915168&view=diff
==============================================================================
--- hadoop/common/trunk/CHANGES.txt (original)
+++ hadoop/common/trunk/CHANGES.txt Tue Feb 23 03:54:14 2010
@@ -163,6 +163,9 @@
 
   OPTIMIZATIONS
 
+    HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..).
+    (mahadev via szetszwo)
+
   BUG FIXES
 
     HADOOP-6293. Fix FsShell -text to work on filesystems other than the

Modified: hadoop/common/trunk/src/java/org/apache/hadoop/fs/HarFileSystem.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/src/java/org/apache/hadoop/fs/HarFileSystem.java?rev=915168&r1=915167&r2=915168&view=diff
==============================================================================
--- hadoop/common/trunk/src/java/org/apache/hadoop/fs/HarFileSystem.java (original)
+++ hadoop/common/trunk/src/java/org/apache/hadoop/fs/HarFileSystem.java Tue Feb 23 03:54:14 2010
@@ -325,25 +325,12 @@
   @Override
   public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
       long len) throws IOException {
-    // need to look up the file in the underlying fs
-    // look up the index 
-    
-    // make sure this is a prt of this har filesystem
-    Path p = makeQualified(file.getPath());
-    Path harPath = getPathInHar(p);
-    String line = fileStatusInIndex(harPath);
-    if (line == null)  {
-      throw new FileNotFoundException("File " + file.getPath() + " not found");
-    }
-    HarStatus harStatus = new HarStatus(line);
-    if (harStatus.isDir()) {
-      return new BlockLocation[0];
-    }
-    FileStatus fsFile = fs.getFileStatus(new Path(archivePath,
-        harStatus.getPartName()));
-    BlockLocation[] rawBlocks = fs.getFileBlockLocations(fsFile, 
-        harStatus.getStartIndex() + start, len);
-    return fakeBlockLocations(rawBlocks, harStatus.getStartIndex());
+    // Just fake the block locations;
+    // it's faster and simpler.
+    // Doing real block-location manipulation
+    // with part files adds a lot of overhead because
+    // of the lookups of file statuses in the index files.
+    return new BlockLocation[]{ new BlockLocation() };
   }
   
   /**
@@ -387,6 +374,63 @@
     public int endHash;
   }
   
+  /**
+   * Collects the FileStatus of each direct child of a directory by scanning
+   * the archive index line by line. It's a brute-force approach: one pass
+   * over the index, keeping entries exactly one path level below the parent.
+   * 
+   * @param parent
+   *          the HarStatus of the directory whose children are wanted
+   * @param statuses
+   *          the list that the children's FileStatus objects are appended to
+   * @param children
+   *          the string list of children for this parent (not read here)
+   * @param archiveIndexStat
+   *          the index FileStatus; supplies the length and child attributes
+   */
+  private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
+      List<String> children, FileStatus archiveIndexStat) throws IOException {
+    // open the index file and scan it until getLen() bytes have been read
+    FSDataInputStream aIn = null;
+    try {
+      aIn = fs.open(archiveIndex);
+      LineReader aLin;
+      long read = 0;
+      aLin = new LineReader(aIn, getConf());
+      String parentString = parent.getName();
+      Path harPath = new Path(parentString);
+      int harlen = harPath.depth();
+      Text line = new Text();
+      while (read < archiveIndexStat.getLen()) {
+        int tmp = aLin.readLine(line);
+        read += tmp;
+        String lineFeed = line.toString();
+        String child = lineFeed.substring(0, lineFeed.indexOf(" "));
+        if ((child.startsWith(parentString))) {
+          Path thisPath = new Path(child);
+          if (thisPath.depth() == harlen + 1) {
+            // name has the parent's prefix and is one level deeper: keep it
+            HarStatus hstatus = new HarStatus(lineFeed);
+            FileStatus childStatus = new FileStatus(hstatus.isDir() ? 0
+                : hstatus.getLength(), hstatus.isDir(), (int) archiveIndexStat
+                .getReplication(), archiveIndexStat.getBlockSize(),
+                archiveIndexStat.getModificationTime(), archiveIndexStat
+                    .getAccessTime(), new FsPermission(archiveIndexStat
+                    .getPermission()), archiveIndexStat.getOwner(),
+                archiveIndexStat.getGroup(), makeRelative(this.uri.toString(),
+                    new Path(hstatus.name)));
+            statuses.add(childStatus);
+          }
+          line.clear();
+        }
+      }
+    } finally {
+      if (aIn != null) {
+        aIn.close();
+      }
+    }
+  }
+  
   // make sure that this harPath is relative to the har filesystem
   // this only works for relative paths. This returns the line matching
   // the file in the index. Returns a null if there is not matching 
@@ -650,10 +694,8 @@
             archiveStatus.getOwner(), archiveStatus.getGroup(), 
             makeRelative(this.uri.toString(), new Path(hstatus.name))));
     else 
-      for (String child: hstatus.children) {
-        FileStatus tmp = getFileStatus(new Path(tmpPath, child));
-        statuses.add(tmp);
-      }
+      fileStatusesInIndex(hstatus, statuses, hstatus.children, archiveStatus);
+    
     return statuses.toArray(new FileStatus[statuses.size()]);
   }
   



Mime
View raw message