hadoop-common-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d...@apache.org
Subject svn commit: r668558 - in /hadoop/core/branches/branch-0.18/src/contrib/hod: CHANGES.txt hodlib/HodRing/hodRing.py
Date Tue, 17 Jun 2008 08:13:01 GMT
Author: ddas
Date: Tue Jun 17 01:13:01 2008
New Revision: 668558

URL: http://svn.apache.org/viewvc?rev=668558&view=rev
Log:
Merge -r 668553:668554 from trunk onto 0.18 branch. Fixes HADOOP-3531.

Modified:
    hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt
    hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt?rev=668558&r1=668557&r2=668558&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt Tue Jun 17 01:13:01 2008
@@ -34,6 +34,9 @@
     HADOOP-3523. Fixes auto-deallocation of cluster if job id is not found in
     Torque's job list (Hemanth Yamijala via ddas)
 
+    HADOOP-3531. Fixes a bug related to handling JobTracker failures because of
+    timing issues on slow nodes. (Hemanth Yamijala via ddas)
+
 Release 0.17.0 - 2008-05-18
 
   INCOMPATIBLE CHANGES

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py?rev=668558&r1=668557&r2=668558&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/HodRing/hodRing.py Tue Jun 17 01:13:01 2008
@@ -435,31 +435,12 @@
       if self.__hadoopThread.exit_code() != 0:
         status = False
     else:
-      code = self.__hadoopThread.exit_code()
-      if code != 0 and code != None:
-        status = False
+      status = self.getCommandStatus()
         
     self.log.debug("hadoop run status: %s" % status)    
     
     if status == False:
-      self.log.error('hadoop error: %s' % (
-                       self.__hadoopThread.exit_status_string()))
-      # read the contents of redirected stderr to print information back to user
-      if os.path.exists(self.err):
-        f = None
-        try:
-          f = open(self.err)
-          lines = f.readlines()
-          # format
-          for line in lines:
-            self.stdErrContents = "%s%s" % (self.stdErrContents, line)
-        finally:
-          if f is not None:
-            f.close()
-      self.log.error('See %s.out and/or %s.err for details. They are ' % \
-                     (self.name, self.name) + \
-                     'located at subdirectories under either ' + \
-                     'hodring.work-dirs or hodring.log-destination-uri.')
+      self.handleFailedCommand()
    
     if (status == True) or (not desc.isIgnoreFailures()):
       return status
@@ -476,6 +457,33 @@
     list.extend(self.workdirs)
     list.append(self.confdir)
 
+  def getCommandStatus(self):
+    status = True
+    ec = self.__hadoopThread.exit_code()
+    if (ec != 0) and (ec != None):
+      status = False
+    return status
+
+  def handleFailedCommand(self):
+    self.log.error('hadoop error: %s' % (
+                     self.__hadoopThread.exit_status_string()))
+    # read the contents of redirected stderr to print information back to user
+    if os.path.exists(self.err):
+      f = None
+      try:
+        f = open(self.err)
+        lines = f.readlines()
+        # format
+        for line in lines:
+          self.stdErrContents = "%s%s" % (self.stdErrContents, line)
+      finally:
+        if f is not None:
+          f.close()
+    self.log.error('See %s.out and/or %s.err for details. They are ' % \
+                   (self.name, self.name) + \
+                   'located at subdirectories under either ' + \
+                   'hodring.work-dirs or hodring.log-destination-uri.')
+
 class HodRing(hodBaseService):
   """The main class for hodring that
   polls the commands it runs"""
@@ -636,9 +644,9 @@
 
       # ok.. now command is running. If this HodRing got jobtracker, 
       # Check if it is ready for accepting jobs, and then only return
-      self.__check_jobtracker(desc, id-1)
+      self.__check_jobtracker(desc, id-1, pkgdir)
       
-  def __check_jobtracker(self, desc, id):
+  def __check_jobtracker(self, desc, id, pkgdir):
     # Check jobtracker status. Return properly if it is ready to accept jobs.
     # Currently Checks for Jetty to come up, the last thing that can be checked
     # before JT completes initialisation. To be perfectly reliable, we need 
@@ -649,7 +657,8 @@
       self.log.debug("Waiting for jobtracker to initialise")
       version = desc.getVersion()
       self.log.debug("jobtracker version : %s" % version)
-      attrs = self.getRunningValues()[id].getFilledInKeyValues()
+      hadoopCmd = self.getRunningValues()[id]
+      attrs = hadoopCmd.getFilledInKeyValues()
       attrs = parseEquals(attrs)
       jobTrackerAddr = attrs['mapred.job.tracker']
       self.log.debug("jobtracker rpc server : %s" % jobTrackerAddr)
@@ -669,6 +678,20 @@
       jettyStatus = False
       jettyStatusmsg = ""
       while sleepTime <= 32:
+        # There is a possibility that the command might fail after a while.
+        # This code will check if the command failed so that a better
+        # error message can be returned to the user.
+        if not hadoopCmd.getCommandStatus():
+          self.log.critical('Hadoop command found to have failed when ' \
+                            'checking for jobtracker status')
+          hadoopCmd.handleFailedCommand()
+          addnInfo = ""
+          if hadoopCmd.stdErrContents is not "":
+            addnInfo = " Information from stderr of the command:\n%s" \
+                                        % (hadoopCmd.stdErrContents)
+          raise Exception("Could not launch the %s using %s/bin/hadoop.%s" \
+                                        % (desc.getName(), pkgdir, addnInfo))
+          
         try:
           jettyConn = httplib.HTTPConnection(jettyAddr)
           jettyConn.request("HEAD", "/jobtracker.jsp")



Mime
View raw message