hadoop-mapreduce-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sc...@apache.org
Subject svn commit: r1034216 - in /hadoop/mapreduce/trunk: ./ src/contrib/raid/src/java/org/apache/hadoop/raid/ src/contrib/raid/src/test/org/apache/hadoop/raid/
Date Fri, 12 Nov 2010 00:01:26 GMT
Author: schen
Date: Fri Nov 12 00:01:25 2010
New Revision: 1034216

URL: http://svn.apache.org/viewvc?rev=1034216&view=rev
Log:
MAPREDUCE-2167. Faster directory traversal for raid node. (Ramkumar Vadali via schen)

Modified:
    hadoop/mapreduce/trunk/CHANGES.txt
    hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/DirectoryTraversal.java
    hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/RaidNode.java
    hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestDirectoryTraversal.java
    hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestRaidNode.java

Modified: hadoop/mapreduce/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=1034216&r1=1034215&r2=1034216&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/CHANGES.txt (original)
+++ hadoop/mapreduce/trunk/CHANGES.txt Fri Nov 12 00:01:25 2010
@@ -168,6 +168,9 @@ Trunk (unreleased changes)
 
     MAPREDUCE-2093. Herriot JT and TT clients should vend statistics. (cos)
 
+    MAPREDUCE-2167. Faster directory traversal for raid node. (Ramkumar Vadali
+    via schen)
+
   OPTIMIZATIONS
 
     MAPREDUCE-1354. Enhancements to JobTracker for better performance and

Modified: hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/DirectoryTraversal.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/DirectoryTraversal.java?rev=1034216&r1=1034215&r2=1034216&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/DirectoryTraversal.java (original)
+++ hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/DirectoryTraversal.java Fri Nov 12 00:01:25 2010
@@ -20,9 +20,16 @@ package org.apache.hadoop.raid;
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Stack;
+import java.util.concurrent.Executor;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.Semaphore;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -45,6 +52,9 @@ public class DirectoryTraversal {
   private List<FileStatus> paths;
   private int pathIdx = 0;  // Next path to process.
   private Stack<Node> stack = new Stack<Node>();
+  private ExecutorService executor;
+
+  private int numThreads;
 
   /**
    * A FileFilter object can be used to choose files during directory traversal.
@@ -88,28 +98,98 @@ public class DirectoryTraversal {
    * @param startPaths A list of paths that need to be traversed
    */
   public DirectoryTraversal(FileSystem fs, List<FileStatus> startPaths) {
+    this(fs, startPaths, 1);
+  }
+
+  public DirectoryTraversal(
+    FileSystem fs, List<FileStatus> startPaths, int numThreads) {
     this.fs = fs;
     paths = startPaths;
     pathIdx = 0;
+    this.numThreads = numThreads;
+    executor = Executors.newFixedThreadPool(numThreads);
   }
 
-  public List<FileStatus> getFilteredFiles(FileFilter filter, int limit)
-      throws IOException {
-    List<FileStatus> filtered = new LinkedList<FileStatus>();
-    int num = 0;
-    while (num < limit) {
-      FileStatus next = getNextFile();
-      if (next == null) {
+  public List<FileStatus> getFilteredFiles(FileFilter filter, int limit) {
+    List<FileStatus> filtered = new ArrayList<FileStatus>();
+
+    // We need this semaphore to block when the number of running workitems
+    // is equal to the number of threads. FixedThreadPool limits the number
+    // of threads, but not the queue size. This way we will limit the memory
+    // usage.
+    Semaphore slots = new Semaphore(numThreads);
+
+    while (true) {
+      synchronized(filtered) {
+        if (filtered.size() >= limit) break;
+      }
+      FilterFileWorkItem work = null;
+      try {
+        Node next = getNextDirectoryNode();
+        if (next == null) {
+          break;
+        }
+        work = new FilterFileWorkItem(filter, next, filtered, slots);
+        slots.acquire();
+      } catch (InterruptedException ie) {
+        break;
+      } catch (IOException e) {
         break;
       }
-      if (filter.check(next)) {
-        num++;
-        filtered.add(next);
+      executor.execute(work);
+    }
+
+    try {
+      // Wait for all submitted items to finish.
+      slots.acquire(numThreads);
+      // If this traversal is finished, shutdown the executor.
+      if (doneTraversal()) {
+        executor.shutdown();
+        executor.awaitTermination(1, TimeUnit.HOURS);
       }
+    } catch (InterruptedException ie) {
     }
+
     return filtered;
   }
 
+  class FilterFileWorkItem implements Runnable {
+    FileFilter filter;
+    Node dir;
+    List<FileStatus> filtered;
+    Semaphore slots;
+
+    FilterFileWorkItem(FileFilter filter, Node dir, List<FileStatus> filtered,
+      Semaphore slots) {
+      this.slots = slots;
+      this.filter = filter;
+      this.dir = dir;
+      this.filtered = filtered;
+    }
+
+    @SuppressWarnings("deprecation")
+    public void run() {
+      try {
+        LOG.info("Initiating file filtering for " + dir.path.getPath());
+        for (FileStatus f: dir.elements) {
+          if (!f.isFile()) {
+            continue;
+          }
+          if (filter.check(f)) {
+            synchronized(filtered) {
+              filtered.add(f);
+            }
+          }
+        }
+      } catch (Exception e) {
+        LOG.error("Error in directory traversal: " 
+          + StringUtils.stringifyException(e));
+      } finally {
+        slots.release();
+      }
+    }
+  }
+
   /**
    * Return the next file.
    * @throws IOException
@@ -168,6 +248,15 @@ public class DirectoryTraversal {
    * @throws IOException
    */
   public FileStatus getNextDirectory() throws IOException {
+    Node dirNode = getNextDirectoryNode();
+    if (dirNode != null) {
+      return dirNode.path;
+    }
+    return null;
+  }
+
+  private Node getNextDirectoryNode() throws IOException {
+
     // Check if traversal is done.
     while (!doneTraversal()) {
       // If traversal is not done, check if the stack is not empty.
@@ -190,7 +279,7 @@ public class DirectoryTraversal {
           }
         } else {
           stack.pop();
-          return node.path;
+          return node;
         }
       }
       // If the stack is empty, do we have more paths?
@@ -215,7 +304,6 @@ public class DirectoryTraversal {
       return;
     }
     Path p = stat.getPath();
-    LOG.info("Traversing to directory " + p);
     FileStatus[] elements = fs.listStatus(p);
     Node newNode = new Node(stat, (elements == null? new FileStatus[0]: elements));
     stack.push(newNode);

Modified: hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/RaidNode.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/RaidNode.java?rev=1034216&r1=1034215&r2=1034216&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/RaidNode.java (original)
+++ hadoop/mapreduce/trunk/src/contrib/raid/src/java/org/apache/hadoop/raid/RaidNode.java Fri Nov 12 00:01:25 2010
@@ -445,7 +445,8 @@ public abstract class RaidNode implement
 
         // Set the time for a new traversal.
         scanState.fullScanStartTime = now();
-        DirectoryTraversal dt = new DirectoryTraversal(fs, selectedPaths);
+        DirectoryTraversal dt = new DirectoryTraversal(fs, selectedPaths,
+          conf.getInt("raid.directorytraversal.threads", 4));
         DirectoryTraversal.FileFilter filter =
           filterForPolicy(selectStartTime, info, allPolicies, scanState.stats);
         returnSet = dt.getFilteredFiles(filter, selectLimit);

Modified: hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestDirectoryTraversal.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestDirectoryTraversal.java?rev=1034216&r1=1034215&r2=1034216&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestDirectoryTraversal.java (original)
+++ hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestDirectoryTraversal.java Fri Nov 12 00:01:25 2010
@@ -58,7 +58,7 @@ public class TestDirectoryTraversal exte
       LOG.info("Enumerating files");
       List<FileStatus> startPaths = new LinkedList<FileStatus>();
       startPaths.add(fs.getFileStatus(topDir));
-      DirectoryTraversal dt = new DirectoryTraversal(fs, startPaths);
+      DirectoryTraversal dt = new DirectoryTraversal(fs, startPaths, 2);
 
       List<FileStatus> selected = new LinkedList<FileStatus>();
       while (true) {

Modified: hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestRaidNode.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestRaidNode.java?rev=1034216&r1=1034215&r2=1034216&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestRaidNode.java (original)
+++ hadoop/mapreduce/trunk/src/contrib/raid/src/test/org/apache/hadoop/raid/TestRaidNode.java Fri Nov 12 00:01:25 2010
@@ -632,13 +632,25 @@ public class TestRaidNode extends TestCa
 
     RaidNode cnode = null;
     try {
-      createTestFiles("/user/dhruba/raidtest/", "/destraid/user/dhruba/raidtest");
+      createTestFiles(
+        "/user/dhruba/raidtest/1/", "/destraid/user/dhruba/raidtest/1");
+      createTestFiles(
+        "/user/dhruba/raidtest/2/", "/destraid/user/dhruba/raidtest/2");
+      createTestFiles(
+        "/user/dhruba/raidtest/3/", "/destraid/user/dhruba/raidtest/3");
+      createTestFiles(
+        "/user/dhruba/raidtest/4/", "/destraid/user/dhruba/raidtest/4");
       LOG.info("Test testSuspendTraversal created test files");
 
       Configuration localConf = new Configuration(conf);
       localConf.set(RaidNode.RAID_LOCATION_KEY, "/destraid");
       localConf.setInt("raid.distraid.max.files", 3);
-      final int numJobsExpected = 4; // 10 test files: 4 jobs with 3 files each.
+      localConf.setInt("raid.directorytraversal.threads", 1);
+      // This is too dependent on the implementation of getFilteredFiles().
+      // It relies on the threading behavior where two directories are traversed
+      // before returning because the list of files is modified in a separate
+      // thread from the one that decides if there are enough files.
+      final int numJobsExpected = 2;
       cnode = RaidNode.createRaidNode(null, localConf);
 
       long start = System.currentTimeMillis();



Mime
View raw message