Return-Path: Delivered-To: apmail-hadoop-core-commits-archive@www.apache.org Received: (qmail 23039 invoked from network); 6 Jun 2008 10:58:08 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 6 Jun 2008 10:58:08 -0000 Received: (qmail 54277 invoked by uid 500); 6 Jun 2008 10:58:10 -0000 Delivered-To: apmail-hadoop-core-commits-archive@hadoop.apache.org Received: (qmail 54245 invoked by uid 500); 6 Jun 2008 10:58:10 -0000 Mailing-List: contact core-commits-help@hadoop.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: core-dev@hadoop.apache.org Delivered-To: mailing list core-commits@hadoop.apache.org Received: (qmail 54236 invoked by uid 99); 6 Jun 2008 10:58:10 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 06 Jun 2008 03:58:10 -0700 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 06 Jun 2008 10:57:21 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id CDF3723889C1; Fri, 6 Jun 2008 03:57:42 -0700 (PDT) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r663889 - in /hadoop/core/trunk/src/contrib/hod: CHANGES.txt bin/hod bin/ringmaster hodlib/GridServices/service.py hodlib/Hod/hadoop.py hodlib/Hod/hod.py hodlib/RingMaster/ringMaster.py Date: Fri, 06 Jun 2008 10:57:42 -0000 To: core-commits@hadoop.apache.org From: ddas@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20080606105742.CDF3723889C1@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: ddas Date: Fri Jun 6 03:57:42 2008 New Revision: 663889 URL: http://svn.apache.org/viewvc?rev=663889&view=rev Log: HADOOP-3184. 
Modified HOD to handle master failures on bad nodes by trying to bring them up on another node in the ring. Contributed by Hemanth Yamijala. Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt hadoop/core/trunk/src/contrib/hod/bin/hod hadoop/core/trunk/src/contrib/hod/bin/ringmaster hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/CHANGES.txt?rev=663889&r1=663888&r2=663889&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/CHANGES.txt (original) +++ hadoop/core/trunk/src/contrib/hod/CHANGES.txt Fri Jun 6 03:57:42 2008 @@ -9,6 +9,10 @@ exist and to auto-deallocate a cluster while reallocating it, if it is already dead. (Hemanth Yamijala via mukund) + HADOOP-3184. Modified HOD to handle master failures on bad nodes by trying + to bring them up on another node in the ring. 
(Hemanth Yamijala via ddas) + + NEW FEATURES IMPROVEMENTS Modified: hadoop/core/trunk/src/contrib/hod/bin/hod URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/bin/hod?rev=663889&r1=663888&r2=663889&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/bin/hod (original) +++ hadoop/core/trunk/src/contrib/hod/bin/hod Fri Jun 6 03:57:42 2008 @@ -225,7 +225,12 @@ False, 120, False, True), ('idleness-limit', 'pos_int', 'Limit after which to deallocate the cluster', - False, 3600, False, True)), + False, 3600, False, True), + + ('max-master-failures', 'pos_int', + 'Defines how many times a master can fail before' \ + ' failing cluster allocation', False, 5, True, True)), + 'gridservice-mapred' : ( ('external', 'bool', "Connect to an already running MapRed?", Modified: hadoop/core/trunk/src/contrib/hod/bin/ringmaster URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/bin/ringmaster?rev=663889&r1=663888&r2=663889&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/bin/ringmaster (original) +++ hadoop/core/trunk/src/contrib/hod/bin/ringmaster Fri Jun 6 03:57:42 2008 @@ -113,7 +113,11 @@ False, 120, False, True), ('idleness-limit', 'pos_int', 'Limit after which to deallocate the cluster', - False, 3600, False, True)), + False, 3600, False, True), + + ('max-master-failures', 'pos_int', + 'Defines how many times a master can fail before' \ + ' failing cluster allocation', False, 5, True, True)), 'resource_manager' : ( ('id', 'string', 'Batch scheduler ID: torque|condor.', @@ -330,7 +334,8 @@ 'ringmaster', 'svcrgy-addr')) serviceClient = hodXRClient(serviceAddr) if serviceClient is not None: - serviceClient.setRMError([str(e),get_exception_string()]) + serviceClient.setRMError([local_fqdn(), str(e), \ + get_exception_string()]) log.info("Reported errors to service registry at %s" % serviceAddr) 
except Exception, e: log.error("Failed to report errors to service registry.") Modified: hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py?rev=663889&r1=663888&r2=663889&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py Fri Jun 6 03:57:42 2008 @@ -66,6 +66,7 @@ self.masterAddress = 'none' self.requiredNode = requiredNode self.failedMsg = None + self.masterFailureCount = 0 def getRequiredNode(self): return self.requiredNode @@ -136,6 +137,9 @@ """ set the master initialized to true. """ self.masterInitialized = True + # Reset failure related variables, as master is initialized successfully. + self.masterFailureCount = 0 + self.failedMsg = None def getMasterAddress(self): """ it needs to change to reflect @@ -152,11 +156,19 @@ return self.serviceDesc.isExternal() def setMasterFailed(self, err): + """Sets variables related to Master failure""" + self.masterFailureCount += 1 self.failedMsg = err + # When command is sent to HodRings, this would have been set to True. + # Reset it to reflect the correct status. + self.launchedMaster = False def getMasterFailed(self): return self.failedMsg - + + def getMasterFailureCount(self): + return self.masterFailureCount + class NodeRequest: """ A class to define a node request. 
""" Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py?rev=663889&r1=663888&r2=663889&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py Fri Jun 6 03:57:42 2008 @@ -604,9 +604,11 @@ if status == 5 or status == 6: ringMasterErrors = self.__svcrgyClient.getRMError() if ringMasterErrors: - self.__log.critical("Cluster could not be allocated because of the following errors on the ringmaster host.\n%s" % \ - (ringMasterErrors[0])) - self.__log.debug("Stack trace on ringmaster: %s" % ringMasterErrors[1]) + self.__log.critical("Cluster could not be allocated because" \ + " of the following errors on the "\ + "ringmaster host %s.\n%s" % \ + (ringMasterErrors[0], ringMasterErrors[1])) + self.__log.debug("Stack trace on ringmaster: %s" % ringMasterErrors[2]) return status def __isRingMasterAlive(self, rmAddr): Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py?rev=663889&r1=663888&r2=663889&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py Fri Jun 6 03:57:42 2008 @@ -227,6 +227,31 @@ return opList + def __adjustMasterFailureCountConfig(self, nodeCount): + # This method adjusts the ringmaster.max-master-failures variable + # to a value that is bounded by a function of the number of + # nodes. 
+ + maxFailures = self.__cfg['ringmaster']['max-master-failures'] + # Count number of masters required - depends on which services + # are external + masters = 0 + if not self.__cfg['gridservice-hdfs']['external']: + masters += 1 + if not self.__cfg['gridservice-mapred']['external']: + masters += 1 + + # So, if there are n nodes and m masters, we look at least for + # all masters to come up. Therefore, at least m nodes should be + # good, which means a maximum of n-m master nodes can fail. + maxFailedNodes = nodeCount - masters + + # The configured max number of failures is now bounded by this + # number. + self.__cfg['ringmaster']['max-master-failures'] = \ + min(maxFailures, maxFailedNodes) + + def _op_allocate(self, args): operation = "allocate" argLength = len(args) @@ -312,6 +337,9 @@ self.__cleanup() raise HodInterruptException() self.__log.debug("Service Registry started.") + + self.__adjustMasterFailureCountConfig(nodes) + try: allocateStatus = self.__cluster.allocate(clusterDir, min, max) except HodInterruptException, h: Modified: hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py?rev=663889&r1=663888&r2=663889&view=diff ============================================================================== --- hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py (original) +++ hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py Fri Jun 6 03:57:42 2008 @@ -370,7 +370,12 @@ for v in self.serviceDict.itervalues(): if (not v.isExternal()): if v.isLaunchable(self.serviceDict): - if not v.isMasterLaunched(): + # If a master is still not launched, or the number of + # retries for launching master is not reached, + # launch master + if not v.isMasterLaunched() and \ + (v.getMasterFailureCount() <= \ + self.cfg['ringmaster']['max-master-failures']): cmdList = v.getMasterCommands(self.serviceDict) v.setlaunchedMaster() 
v.setMasterAddress(addr) @@ -441,7 +446,8 @@ def setHodRingErrors(self, addr, errors): """This method is called by the hodrings to update errors it encountered while starting up""" - self.log.critical("Hodring at %s failed with following errors:\n%s" % (addr, errors)) + self.log.critical("Hodring at %s failed with following errors:\n%s" \ + % (addr, errors)) lock = self.masterParamLock lock.acquire() try: @@ -452,7 +458,8 @@ idx = addr.rfind('_') if idx is not -1: addr = addr[:idx] - v.setMasterFailed("Hodring at %s failed with following errors:\n%s" % (addr, errors)) + v.setMasterFailed("Hodring at %s failed with following" \ + " errors:\n%s" % (addr, errors)) except: self.log.debug(get_exception_string()) pass @@ -478,8 +485,16 @@ pass else: self.log.debug("getServiceAddr service: %s" % service) + # Check if we should give up ! If the limit on max failures is hit, + # give up. err = service.getMasterFailed() - if err is not None: + if (err is not None) and \ + (service.getMasterFailureCount() > \ + self.cfg['ringmaster']['max-master-failures']): + self.log.critical("Detected errors (%s) beyond allowed number"\ + " of failures (%s). Flagging error to client" \ + % (service.getMasterFailureCount(), \ + self.cfg['ringmaster']['max-master-failures'])) addr = "Error: " + err elif (service.isMasterInitialized()): addr = service.getMasterAddrs()[0]