Return-Path: X-Original-To: apmail-cloudstack-commits-archive@www.apache.org Delivered-To: apmail-cloudstack-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 2546E10A9D for ; Mon, 25 Nov 2013 09:44:34 +0000 (UTC) Received: (qmail 66102 invoked by uid 500); 25 Nov 2013 09:44:32 -0000 Delivered-To: apmail-cloudstack-commits-archive@cloudstack.apache.org Received: (qmail 65950 invoked by uid 500); 25 Nov 2013 09:44:26 -0000 Mailing-List: contact commits-help@cloudstack.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@cloudstack.apache.org Delivered-To: mailing list commits@cloudstack.apache.org Received: (qmail 65940 invoked by uid 99); 25 Nov 2013 09:44:25 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 25 Nov 2013 09:44:25 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 0E6A3904DD0; Mon, 25 Nov 2013 09:44:24 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: jayapal@apache.org To: commits@cloudstack.apache.org Message-Id: X-Mailer: ASF-Git Admin Mailer Subject: git commit: updated refs/heads/master to 0be4a68 Date: Mon, 25 Nov 2013 09:44:24 +0000 (UTC) Updated Branches: refs/heads/master ab2c38c05 -> 0be4a685e CLOUDSTACK-5164 Unmonit for 30 minutes for a failed process Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/0be4a685 Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/0be4a685 Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/0be4a685 Branch: refs/heads/master Commit: 0be4a685e8cb0caedf670e45075f1b4e52237f5c Parents: ab2c38c Author: Jayapal Authored: Mon Nov 25 14:58:12 2013 +0530 Committer: Jayapal Committed: Mon Nov 25 15:12:48 2013 +0530 ---------------------------------------------------------------------- .../config/opt/cloud/bin/monitor_service.sh | 2 +- .../debian/config/root/monitorServices.py | 199 +++++++++++++++---- 2 files changed, 161 insertions(+), 40 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cloudstack/blob/0be4a685/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh ---------------------------------------------------------------------- diff --git a/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh b/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh index c4d99d2..51b6923 100755 --- a/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh +++ b/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh @@ -64,7 +64,7 @@ crontab -l | grep -v monitorServices.py | crontab - create_config $config #add cron job -(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */1 * * * * /usr/bin/python /root/monitorServices.py") | crontab - +(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */3 * * * * /usr/bin/python /root/monitorServices.py") | crontab - unlock_exit 0 $lock $locked http://git-wip-us.apache.org/repos/asf/cloudstack/blob/0be4a685/systemvm/patches/debian/config/root/monitorServices.py ---------------------------------------------------------------------- diff --git a/systemvm/patches/debian/config/root/monitorServices.py b/systemvm/patches/debian/config/root/monitorServices.py index 2cec672..4e1b7e0 100755 --- a/systemvm/patches/debian/config/root/monitorServices.py +++ b/systemvm/patches/debian/config/root/monitorServices.py @@ -19,14 +19,13 @@ -__author__ = 'jayapalreddy' from ConfigParser import SafeConfigParser from subprocess import * from os import path import time +import os -monitor_log='/var/log/monitor.log' class StatusCodes: SUCCESS = 0 FAILED = 1 @@ -35,42 +34,58 @@ class StatusCodes: STOPPED = 4 STARTING = 5 -class log: +class Log: INFO = 'INFO' ALERT = 'ALERT' CRIT = 'CRIT' NOTIF = 'NOTIF' - +class Config: + MONIT_AFTER_MINS = 30 + SLEEP_SEC = 1 + RETRY_ITERATIONS = 10 + RETRY_FOR_RESTART = 5 + MONITOR_LOG = '/var/log/monitor.log' + UNMONIT_PS_FILE = '/etc/unmonit_psList.txt' def getConfig( config_file_path = "/etc/monitor.conf" ): + """ + Reads the process configuration from the config file. + Config file contains the processes to be monitored. + + """ process_dict = {} parser = SafeConfigParser() parser.read( config_file_path ) - #print 'Read values:\n' for section in parser.sections(): - # print section process_dict[section] = {} for name, value in parser.items(section): process_dict[section][name] = value -# print ' %s = %r' % (name, value) +# printd (" %s = %r" % (name, value)) return process_dict def printd (msg): + """ + prints the debug messages + """ + #for debug + #print msg return 0 - f= open(monitor_log,'r+') + f= open(Config.MONITOR_LOG,'r+') f.seek(0, 2) f.write(str(msg)+"\n") f.close() def raisealert(severity, msg, process_name=None): + """ Writes the alert message""" + #timeStr=str(time.ctime()) if process_name is not None: log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n" @@ -82,9 +97,12 @@ def raisealert(severity, msg, process_name=None): def isPidMatchPidFile(pidfile, pids): + """ Compares the running process pid with the pid in pid file. + If a process with multiple pids then it matches with pid file + """ if pids is None or isinstance(pids,list) != True or len(pids) == 0: - print "Invalid Arguments" + printd ("Invalid Arguments") return StatusCodes.FAILED if not path.isfile(pidfile): #It seems there is no pid file for this service @@ -100,12 +118,18 @@ def isPidMatchPidFile(pidfile, pids): inp = fd.read() + + if not inp: + fd.close() + return StatusCodes.FAILED + printd("file content "+str(inp)) printd(pids) tocheck_pid = inp.strip() for item in pids: if str(tocheck_pid) == item.strip(): printd("pid file matched") + fd.close() return StatusCodes.SUCCESS fd.close() @@ -114,19 +138,22 @@ def isPidMatchPidFile(pidfile, pids): def checkProcessStatus( process ): + """ + Check the process running status, if not running tries to restart + """ process_name = process.get('processname') service_name = process.get('servicename') pidfile = process.get('pidfile') #temp_out = None restartFailed=False - pidFileMatched=1 + pidFileMatched=False + pids='' cmd='' if process_name is None: - print "\n Invalid Process Name" + printd ("\n Invalid Process Name") return StatusCodes.INVALID_INP else: - msg="checking the process " + process_name - printd(msg) + printd("checking the process " + process_name) cmd = 'pidof ' + process_name printd(cmd) #cmd = 'service ' + process_name + ' status' @@ -136,20 +163,19 @@ def checkProcessStatus( process ): #check there is only one pid or not if exitStatus == 0: + pids = temp_out.split(' ') msg="pids: " +temp_out; printd(msg) - pids = temp_out.split(' ') #there is more than one process so match the pid file - #if not matched set pidFileMatched=0 + #if not matched set pidFileMatched=False printd("Checking pid file") if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS: - pidFileMatched = 1; + pidFileMatched = True; else: - pidFileMatched = 0; + pidFileMatched = False; - printd(pidFileMatched) - if exitStatus == 0 and pidFileMatched == 1: + if exitStatus == 0 and pidFileMatched == True: printd("The process is running ....") return StatusCodes.RUNNING else: @@ -157,28 +183,28 @@ def checkProcessStatus( process ): msg="The process " + process_name +" is not running trying recover " printd(msg) #Retry the process state for few seconds - for i in range(1,10): + for i in range(1, Config.RETRY_ITERATIONS): pout = Popen(cmd, shell=True, stdout=PIPE) exitStatus = pout.wait() temp_out = pout.communicate()[0] - if i < 5: # this is just for trying few more times + if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times if exitStatus == 0: pids = temp_out.split(' ') if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS: - pidFileMatched = 1; + pidFileMatched = True; printd("pid file is matched ...") - raisealert(log.ALERT, "The process detected as running", process_name) + raisealert(Log.ALERT, "The process detected as running", process_name) break else: printd("pid file is not matched ...") - pidFileMatched = 0; + pidFileMatched = False; + time.sleep(Config.SLEEP_SEC) continue - time.sleep(1) else: msg="The process " +process_name+" is not running trying recover " - raisealert(log.INFO,process_name,msg) + raisealert(Log.INFO,process_name,msg) if service_name == 'apache2': # Killing apache2 process with this the main service will not start @@ -189,7 +215,7 @@ def checkProcessStatus( process ): cmd = 'service ' + service_name + ' restart' - time.sleep(1) + time.sleep(Config.SLEEP_SEC) #return_val= check_call(cmd , shell=True) cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT) @@ -198,37 +224,135 @@ def checkProcessStatus( process ): if return_val == 0: printd("The process" + process_name +" recovered successfully ") msg="The process " +process_name+" is recovered successfully " - raisealert(log.INFO,msg,process_name) + raisealert(Log.INFO,msg,process_name) break; else: #retry restarting the process for few tries printd("process restart failing trying again ....") restartFailed=True - time.sleep(1) + time.sleep(Config.SLEEP_SEC) continue #for end here if restartFailed == True: msg="The process %s recover failed "%process_name - raisealert(log.ALERT,process_name,msg) + raisealert(Log.ALERT,process_name,msg) printd("Restart failed after number of retries") return StatusCodes.STOPPED return StatusCodes.RUNNING -def raiseAlert( process_name ): - print "process name %s is raised "%process_name def monitProcess( processes_info ): + """ + Monitors the processes which got from the config file + """ if len( processes_info ) == 0: - print "Invalid Input" + printd("Invalid Input") return StatusCodes.INVALID_INP + + dict_unmonit={} + umonit_update={} + + if not path.isfile(Config.UNMONIT_PS_FILE): + printd('Unmonit File not exist') + else: + #load the dictionary with unmonit process list + dict_unmonit = loadPsFromUnMonitFile() + + #time for noting process down time + csec = repr(time.time()).split('.')[0] + + unMonitPs=False + for process,properties in processes_info.items(): + #skip the process it its time stamp less than Config.MONIT_AFTER_MINS + printd ("checking the process %s \n" %process) + + if not is_emtpy(dict_unmonit): + if dict_unmonit.has_key(process): + ts = dict_unmonit[process] + printd("Time difference=%s" %str(int(csec) - int(ts))) + tmin = (int(csec) - int(ts) )/60 + + if ( int(csec) - int(ts) )/60 < Config.MONIT_AFTER_MINS: + raisealert(Log.ALERT, "The %s get monitor after %s minutes " %(process, Config.MONIT_AFTER_MINS)) + printd('process will be monitored after %s min' %(str(int(Config.MONIT_AFTER_MINS) - tmin))) + unMonitPs=True + continue + if checkProcessStatus( properties) != StatusCodes.RUNNING: - print "\n Process %s is not Running"%process + printd( "\n Process %s is not Running"%process) + #add this process into unmonit list + printd ("updating the process for unmonit %s\n" %process) + umonit_update[process]=csec + + + #if dict is not empty write to file else delete it + if not is_emtpy(umonit_update): + writePsListToUnmonitFile(umonit_update) + else: + if is_emtpy(umonit_update) and unMonitPs == False: + #delete file it is there + if path.isfile(Config.UNMONIT_PS_FILE): + printd("Removing the file %s" %Config.UNMONIT_PS_FILE) + os.remove(Config.UNMONIT_PS_FILE) + + + +def loadPsFromUnMonitFile(): + dict_unmonit = {} + + try: + fd = open(Config.UNMONIT_PS_FILE) + except: + printd("Failed to open file %s " %(Config.UNMONIT_PS_FILE)) + return StatusCodes.FAILED + + ps = fd.read() + + if not ps: + printd("File %s content is empty " %Config.UNMONIT_PS_FILE) + return StatusCodes.FAILED + + printd(ps) + plist = ps.split(',') + plist.remove('') + for i in plist: + dict_unmonit[i.split(':')[0]] = i.split(':')[1] + + fd.close(); + + return dict_unmonit; + + +def writePsListToUnmonitFile(umonit_update): + printd("Write updated unmonit list to file") + line='' + for i in umonit_update: + line+=str(i)+":"+str(umonit_update[i])+',' + printd(line) + try: + fd=open(Config.UNMONIT_PS_FILE,'w') + except: + printd("Failed to open file %s " %Config.UNMONIT_PS_FILE) + return StatusCodes.FAILED + + fd.write(line); + fd.close() + + +def is_emtpy(struct): + """ + Checks wether the given struct is empty or not + """ + if struct: + return False + else: + return True def main(): ''' @@ -238,14 +362,11 @@ def main(): printd("monitoring started") temp_dict = getConfig() - ''' - Step2: Get Previous Run Log - ''' ''' - Step3: Monitor and Raise Alert + Step2: Monitor and Raise Alert ''' - #raisealert(log.INFO, 'Monit started') + #raisealert(Log.INFO, 'Monit started') monitProcess( temp_dict )