hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From acmur...@apache.org
Subject svn commit: r705420 - in /hadoop/core/trunk/src/contrib/hod: CHANGES.txt bin/hod hodlib/Hod/hadoop.py hodlib/NodePools/torque.py
Date Fri, 17 Oct 2008 00:21:18 GMT
Author: acmurthy
Date: Thu Oct 16 17:21:17 2008
New Revision: 705420

URL: http://svn.apache.org/viewvc?rev=705420&view=rev
Log:
HADOOP-3217. Decrease the rate at which the hod queries the resource manager for job status.
Contributed by Hemanth Yamijala.

Modified:
    hadoop/core/trunk/src/contrib/hod/CHANGES.txt
    hadoop/core/trunk/src/contrib/hod/bin/hod
    hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
    hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py

Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/CHANGES.txt?rev=705420&r1=705419&r2=705420&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/trunk/src/contrib/hod/CHANGES.txt Thu Oct 16 17:21:17 2008
@@ -110,6 +110,13 @@
     HADOOP-3703. Fixes logcondense.py to use the new format of hadoop dfs -lsr
     command line output format. (Vinod Kumar Vavilapalli via yhemanth)
 
+Release 0.17.3 - Unreleased 
+
+  BUG FIXES
+
+    HADOOP-3217. Decrease the rate at which the hod queries the resource
+    manager for job status. (Hemanth Yamijala via acmurthy) 
+
 Release 0.17.0 - 2008-05-18
 
   INCOMPATIBLE CHANGES

Modified: hadoop/core/trunk/src/contrib/hod/bin/hod
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/bin/hod?rev=705420&r1=705419&r2=705420&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/bin/hod (original)
+++ hadoop/core/trunk/src/contrib/hod/bin/hod Thu Oct 16 17:21:17 2008
@@ -159,7 +159,19 @@
               True, 10, False, True, 'W'),
 
             ('log-rollover-count', 'pos_int', 'Specifies the number of rolled-over log files of HOD client. A zero value disables rollover.',
-              True, 5, False, True, 'L')),
+              True, 5, False, True, 'L'),
+
+             ('job-status-query-interval', 'pos_int', 'Specifies the time between checking for job status', 
+              False, 30, False, True),
+
+             ('job-command-failure-interval', 'pos_int', 'Specifies the time between checking for failed job status or submission commands', 
+              False, 10, False, True),
+
+             ('job-status-query-failure-retries', 'pos_int', 'Specifies the number of times job status failure queries are retried', 
+              False, 3, False, True),
+
+             ('job-submission-failure-retries', 'pos_int', 'Specifies the number of times job submission failure queries are retried',
+              False, 3, False, True)),
 
             'resource_manager' : (
              ('id', 'string', 'Batch scheduler ID: torque|condor.',

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py?rev=705420&r1=705419&r2=705420&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py Thu Oct 16 17:21:17 2008
@@ -180,25 +180,29 @@
     return serviceData
   
   def __check_job_status(self):
-    initWaitCount = 20
-    count = 0
+    failureCount = 0
     status = False
     state = 'Q'
     userLimitsFirstFlag = True
 
-    while state == 'Q':
+    while (state=='Q') or (state==False):
       if hodInterrupt.isSet():
         raise HodInterruptException()
 
       jobInfo = self.__nodePool.getJobInfo()
       state = jobInfo['job_state']
-      if (state==False) or (state!='Q'):
+      self.__log.debug('job state %s' % state)
+      if state == False:
+        failureCount += 1
+        if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
+          self.__log.debug('Number of retries reached max limit while querying job status')
+          break
+        time.sleep(self.__cfg['hod']['job-command-failure-interval'])
+      elif state!='Q':
         break
-      count = count + 1
-      if count < initWaitCount:
-        time.sleep(0.5)
       else:
-        time.sleep(10)
+        self.__log.debug('querying for job status after job-status-query-interval')
+        time.sleep(self.__cfg['hod']['job-status-query-interval'])
 
       if self.__cfg['hod'].has_key('job-feasibility-attr') and \
                       self.__cfg['hod']['job-feasibility-attr']:
@@ -255,7 +259,7 @@
           time.sleep(1)
           count = count + 1
           # check to see if the job exited by any chance in that time:
-          if (count % 10 == 0):
+          if (count % self.__cfg['hod']['job-status-query-interval'] == 0):
             if not self.__check_job_status():
               break
     return ringmasterXRS
@@ -273,9 +277,9 @@
         serviceAddress = xmlrpcClient.getServiceAddr(serviceName)
         if serviceAddress:
           if serviceAddress == 'not found':
-            time.sleep(.5)
+            time.sleep(1)
           # check to see if the job exited by any chance in that time:
-            if (i % 10 == 0):
+            if ((i+1) % self.__cfg['hod']['job-status-query-interval'] == 0):
               if not self.__check_job_status():
                 break
           else:
@@ -486,6 +490,7 @@
     
   def allocate(self, clusterDir, min, max=None):
     status = 0
+    failureCount = 0
     self.__svcrgyClient = self.__get_svcrgy_client()
         
     self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))
@@ -499,6 +504,23 @@
       if self.__cfg['hod'].has_key('walltime'):
         walltime = self.__cfg['hod']['walltime']
       self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
+      # if the job submission returned an error other than no resources
+      # retry a couple of times
+      while (self.jobId is False) and (exitCode != 188):
+        if hodInterrupt.isSet():
+          raise HodInterruptException()
+
+        failureCount += 1
+        if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
+          self.__log.debug("failed submitting job more than the retries. exiting")
+          break
+        else:
+          # wait a bit before retrying
+          time.sleep(self.__cfg['hod']['job-command-failure-interval'])
+          if hodInterrupt.isSet():
+            raise HodInterruptException()
+          self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
+
       if self.jobId:
         jobStatus = None
         try:
@@ -632,7 +654,7 @@
         if exitCode == 188:
           self.__log.critical("Request execeeded maximum resource allocation.")
         else:
-          self.__log.critical("Insufficient resources available.")
+          self.__log.critical("Job submission failed with exit code %s" % exitCode)
         status = 4
       else:    
         self.__log.critical("Scheduler failure, allocation failed.\n\n")        

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py?rev=705420&r1=705419&r2=705420&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py Thu Oct 16 17:21:17 2008
@@ -276,33 +276,22 @@
     return id
 
   def getJobInfo(self, jobId=None):
-    #torque error code when credentials fail, a temporary condition sometimes.
-    credFailureErrorCode = 171
+
     jobNonExistentErrorCode = 153
-    credFailureRetries = 10
-    i = 0
-    self.__jobInfo = None
+    self.__jobInfo = { 'job_state' : False }
     
     if jobId == None:
       jobId = self.getServiceId()
 
-    while i < credFailureRetries:
-      qstatInfo, exitCode = self.__torque.qstat(jobId)
-      if exitCode == 0:
-        self.__jobInfo = qstatInfo
-        break
-      elif exitCode == jobNonExistentErrorCode:
-        # This really means that the job completed
-        # However, setting only job_state for now, not 
-        # any other attributes, as none seem required.
-        self.__jobInfo = { 'job_state' : 'C' }
-        break
-      else:
-        if exitCode == credFailureErrorCode:
-          time.sleep(1)
-          i = i+1
-        else:
-          break
+    qstatInfo, exitCode = self.__torque.qstat(jobId)
+    if exitCode == 0:
+      self.__jobInfo = qstatInfo
+    elif exitCode == jobNonExistentErrorCode:
+      # This really means that the job completed
+      # However, setting only job_state for now, not 
+      # any other attributes, as none seem required.
+      self.__jobInfo = { 'job_state' : 'C' }
+
     return self.__jobInfo
 
   def deleteJob(self, jobId):



Mime
View raw message