hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yhema...@apache.org
Subject svn commit: r773891 - in /hadoop/core/branches/branch-0.20: ./ src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/ src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/
Date Tue, 12 May 2009 13:38:20 GMT
Author: yhemanth
Date: Tue May 12 13:38:19 2009
New Revision: 773891

URL: http://svn.apache.org/viewvc?rev=773891&view=rev
Log:
Merge -r 773888:773889 from trunk to branch 0.20 to fix HADOOP-5641.

Modified:
    hadoop/core/branches/branch-0.20/   (props changed)
    hadoop/core/branches/branch-0.20/CHANGES.txt   (contents, props changed)
    hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java
    hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java
    hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java

Propchange: hadoop/core/branches/branch-0.20/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 12 13:38:19 2009
@@ -1,2 +1,2 @@
 /hadoop/core/branches/branch-0.19:713112
-/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746338,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755960,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,771661,772844,772876,772920
+/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746338,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755960,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,771661,772844,772876,772920,773889

Modified: hadoop/core/branches/branch-0.20/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/CHANGES.txt?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.20/CHANGES.txt Tue May 12 13:38:19 2009
@@ -61,6 +61,9 @@
     KILLED (this used to happen when the SetupTask would come back with a 
     success after the job has been killed). (Amar Kamat via ddas)
 
+    HADOOP-5641. Fix a NullPointerException in capacity scheduler's memory
+    based scheduling code when jobs get retired. (yhemanth)
+
 Release 0.20.0 - 2009-04-15
 
   INCOMPATIBLE CHANGES

Propchange: hadoop/core/branches/branch-0.20/CHANGES.txt
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 12 13:38:19 2009
@@ -1,3 +1,3 @@
 /hadoop/core/branches/branch-0.18/CHANGES.txt:727226
 /hadoop/core/branches/branch-0.19/CHANGES.txt:713112
-/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752514,752555,752590,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,772844,772876,772920
+/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752514,752555,752590,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,772844,772876,772920,773889

Modified: hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java (original)
+++ hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java Tue May 12 13:38:19 2009
@@ -413,7 +413,8 @@
             }
           }
           else {
-            // mem requirements not met. Rather than look at the next job, 
+            // mem requirements not met or could not be computed for this TT
+            // Rather than look at the next job, 
             // we return nothing to the TT, with the hope that we improve 
             // chances of finding a suitable TT for this job. This lets us
             // avoid starving jobs with high mem requirements.         

Modified: hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java (original)
+++ hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java Tue May 12 13:38:19 2009
@@ -112,7 +112,8 @@
    * residing on the given TaskTracker.
    * 
    * @param taskTracker
-   * @return amount of memory that is used by the residing tasks
+   * @return amount of memory that is used by the residing tasks,
+   *          null if memory cannot be computed for some reason.
    */
   private synchronized Memory getMemReservedForTasks(
       TaskTrackerStatus taskTracker) {
@@ -141,6 +142,26 @@
       // accounted in used memory.
       if ((task.getRunState() == TaskStatus.State.RUNNING)
           || (task.getRunState() == TaskStatus.State.COMMIT_PENDING)) {
+        JobInProgress job = scheduler.taskTrackerManager.getJob(
+                                              task.getTaskID().getJobID());
+        if (job == null) {
+          // This scenario can happen if a job was completed/killed
+          // and retired from JT's memory. In this state, we can ignore 
+          // the running task status and compute memory for the rest of 
+          // the tasks. However, any scheduling done with this computation
+          // could result in over-subscribing of memory for tasks on this
+          // TT (as the unaccounted for task is still running).
+          // So, it is safer to not schedule anything for this TT
+          // One of the ways of doing that is to return null from here
+          // and check for null in the calling method.
+          LOG.info("Task tracker: " + taskTracker.getHost() + " is reporting "
+                    + "a running / commit pending task: " + task.getTaskID()
+                    + " but no corresponding job was found. "
+                    + "Maybe job was retired. Not computing "
+                    + "memory values for this TT.");
+          return null;
+        }
+        
         JobConf jConf =
             scheduler.taskTrackerManager.getJob(task.getTaskID().getJobID())
                 .getJobConf();
@@ -194,6 +215,16 @@
     }
 
     Memory memReservedForTasks = getMemReservedForTasks(taskTracker);
+    if (memReservedForTasks == null) {
+      // For some reason, maybe because we could not find the job
+      // corresponding to a running task (as can happen if the job
+      // is retired in between), we could not compute the memory state
+      // on this TT. Treat this as an error, and fail memory
+      // requirements.
+      LOG.info("Could not compute memory for taskTracker: " 
+                + taskTracker.getHost() + ". Failing memory requirements.");
+      return false;
+    }
     long vmemUsedOnTT = memReservedForTasks.vmem;
     long pmemUsedOnTT = memReservedForTasks.pmem;
 
@@ -243,4 +274,4 @@
         + jobVMemForTask + " jobPMemForTask = " + jobPMemForTask);
     return true;
   }
-}
\ No newline at end of file
+}

Modified: hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java (original)
+++ hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java Tue May 12 13:38:19 2009
@@ -379,6 +379,10 @@
       job.kill();
     }
 
+    public void removeJob(JobID jobid) {
+      jobs.remove(jobid);
+    }
+    
     @Override
     public JobInProgress getJob(JobID jobid) {
       return jobs.get(jobid);
@@ -1785,6 +1789,84 @@
     assertNull(scheduler.assignTasks(tracker("tt1")));
   }
 
+  /**
+   * Testcase to verify fix for a NPE (HADOOP-5641), when memory based
+   * scheduling is enabled and jobs are retired from memory when tasks
+   * are still active on some Tasktrackers.
+   *  
+   * @throws IOException
+   */
+  public void testMemoryMatchingWithRetiredJobs() throws IOException {
+    // create a cluster with a single node.
+    LOG.debug("Starting cluster with 1 tasktracker, 2 map and 2 reduce slots");
+    taskTrackerManager = new FakeTaskTrackerManager(1, 2, 2);
+    TaskTrackerStatus.ResourceStatus ttStatus =
+        taskTrackerManager.getTaskTracker("tt1").getResourceStatus();
+    LOG.debug("Assume TT has 4 GB virtual mem and 2 GB RAM");
+    ttStatus.setTotalVirtualMemory(4 * 1024 * 1024 * 1024L);
+    ttStatus.setReservedVirtualMemory(0);
+    ttStatus.setTotalPhysicalMemory(2 * 1024 * 1024 * 1024L);
+    ttStatus.setReservedPhysicalMemory(0);
+
+    // create scheduler
+    ArrayList<FakeQueueInfo> queues = new ArrayList<FakeQueueInfo>();
+    queues.add(new FakeQueueInfo("default", 100.0f, true, 100));
+    taskTrackerManager.addQueues(new String[] { "default" });
+    resConf.setFakeQueues(queues);
+    scheduler.setTaskTrackerManager(taskTrackerManager);
+    // enabled memory-based scheduling
+    LOG.debug("By default, jobs get 0.5 GB per task vmem" +
+        " and 2 GB max vmem, with 50% of it for RAM");
+    scheduler.getConf().setLong(JobConf.MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY,
+        512 * 1024 * 1024L);
+    scheduler.getConf().setLong(JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY,
+        2 * 1024 * 1024 * 1024L);
+    resConf.setDefaultPercentOfPmemInVmem(50.0f);
+    resConf.setLimitMaxPmemForTasks(1 * 1024 * 1024 * 1024L);
+    scheduler.setResourceManagerConf(resConf);
+    scheduler.start();
+    
+    // submit a normal job
+    LOG.debug("Submitting a normal job with 2 maps and 2 reduces");
+    JobConf jConf = new JobConf();
+    jConf.setNumMapTasks(2);
+    jConf.setNumReduceTasks(2);
+    jConf.setQueueName("default");
+    jConf.setUser("u1");
+    FakeJobInProgress job1 = submitJobAndInit(JobStatus.PREP, jConf);
+
+    // 1st cycle - 1 map gets assigned.
+    Task t = checkAssignment("tt1", "attempt_test_0001_m_000001_0 on tt1");
+    
+    // kill this job !
+    taskTrackerManager.killJob(job1.getJobID());
+    
+    // retire the job
+    taskTrackerManager.removeJob(job1.getJobID());
+    
+    // submit another job.
+    LOG.debug("Submitting another normal job with 1 map and 1 reduce");
+    jConf = new JobConf();
+    jConf.setNumMapTasks(1);
+    jConf.setNumReduceTasks(1);
+    jConf.setQueueName("default");
+    jConf.setUser("u1");
+    FakeJobInProgress job2 = submitJobAndInit(JobStatus.PREP, jConf);
+    
+    // 2nd cycle - nothing should get assigned. Memory matching code
+    // will see the job is missing and fail memory requirements.
+    assertNull(scheduler.assignTasks(tracker("tt1")));
+    // calling again should not make a difference, as the task is still running
+    assertNull(scheduler.assignTasks(tracker("tt1")));
+    
+    // finish the task on the tracker.
+    taskTrackerManager.finishTask("tt1", t.getTaskID().toString(), job1);
+    // now a new task can be assigned.
+    t = checkAssignment("tt1", "attempt_test_0002_m_000001_0 on tt1");
+    // reduce can be assigned.
+    t = checkAssignment("tt1", "attempt_test_0002_r_000001_0 on tt1");
+  }
+  
   protected TaskTrackerStatus tracker(String taskTrackerName) {
     return taskTrackerManager.getTaskTracker(taskTrackerName);
   }



Mime
View raw message