Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 089C4200C42 for ; Fri, 17 Feb 2017 23:56:43 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 0779F160B82; Fri, 17 Feb 2017 22:56:43 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id C28CC160B81 for ; Fri, 17 Feb 2017 23:56:41 +0100 (CET) Received: (qmail 96263 invoked by uid 500); 17 Feb 2017 22:56:40 -0000 Mailing-List: contact commits-help@ambari.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ambari-dev@ambari.apache.org Delivered-To: mailing list commits@ambari.apache.org Received: (qmail 95567 invoked by uid 99); 17 Feb 2017 22:56:39 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 17 Feb 2017 22:56:39 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id B2895E0206; Fri, 17 Feb 2017 22:56:39 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: rzang@apache.org To: commits@ambari.apache.org Date: Fri, 17 Feb 2017 22:56:49 -0000 Message-Id: <15ce78db332e4b2ea34919f790fc230c@git.apache.org> In-Reply-To: <9e3a1ef6943440d4974783c5f5a6b68e@git.apache.org> References: <9e3a1ef6943440d4974783c5f5a6b68e@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [11/51] [abbrv] ambari git commit: AMBARI-19802. Debug: agent randomly losing heartbeat with the server. (Attila Doroszlai via stoader) archived-at: Fri, 17 Feb 2017 22:56:43 -0000 AMBARI-19802. Debug: agent randomly losing heartbeat with the server. (Attila Doroszlai via stoader) (cherry picked from commit d071727734468fd78fdf02a94f490b80cba8cdb2) Change-Id: I12749ac0225a1c8c13a79b71e44b84b75162d770 Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/44f45199 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/44f45199 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/44f45199 Branch: refs/heads/branch-feature-BUG-74026 Commit: 44f45199abbac060767f2cda997406ef8f092f60 Parents: f04d7ba Author: Attila Doroszlai Authored: Wed Feb 1 19:06:29 2017 +0100 Committer: Zuul Committed: Mon Feb 6 21:48:54 2017 -0800 ---------------------------------------------------------------------- .../src/main/python/ambari_agent/Controller.py | 51 +++++++++++++------- .../ambari_agent/StatusCommandsExecutor.py | 1 + 2 files changed, 34 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/44f45199/ambari-agent/src/main/python/ambari_agent/Controller.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/Controller.py b/ambari-agent/src/main/python/ambari_agent/Controller.py index 09f15de..61a74e6 100644 --- a/ambari-agent/src/main/python/ambari_agent/Controller.py +++ b/ambari-agent/src/main/python/ambari_agent/Controller.py @@ -274,7 +274,7 @@ class Controller(threading.Thread): self.DEBUG_SUCCESSFULL_HEARTBEATS = 0 retry = False certVerifFailed = False - state_interval = self.config.get('heartbeat', 'state_interval_seconds', '60') + state_interval = int(self.config.get('heartbeat', 'state_interval_seconds', '60')) # last time when state was successfully sent to server last_state_timestamp = 0.0 @@ -289,27 +289,34 @@ class Controller(threading.Thread): getrecoverycommands_timestamp = 0.0 getrecoverycommands_interval = self.netutil.HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC + heartbeat_interval = self.netutil.HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC + while not self.DEBUG_STOP_HEARTBEATING: - heartbeat_interval = self.netutil.HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC + current_time = time.time() + logging_level = logging.DEBUG + if current_time - heartbeat_running_msg_timestamp > state_interval: + # log more steps every minute or so + logging_level = logging.INFO + heartbeat_running_msg_timestamp = current_time try: - crt_time = time.time() - if crt_time - heartbeat_running_msg_timestamp > int(state_interval): - logger.info("Heartbeat (response id = %s) with server is running...", self.responseId) - heartbeat_running_msg_timestamp = crt_time + logger.log(logging_level, "Heartbeat (response id = %s) with server is running...", self.responseId) send_state = False if not retry: - if crt_time - last_state_timestamp > int(state_interval): + if current_time - last_state_timestamp > state_interval: send_state = True - data = json.dumps( - self.heartbeat.build(self.responseId, send_state, self.hasMappedComponents)) + logger.log(logging_level, "Building heartbeat message") + + data = json.dumps(self.heartbeat.build(self.responseId, send_state, self.hasMappedComponents)) else: self.DEBUG_HEARTBEAT_RETRIES += 1 - - logger.debug("Sending Heartbeat (id = %s): %s", self.responseId, data) + if logger.isEnabledFor(logging.DEBUG): + logger.log(logging_level, "Sending Heartbeat (id = %s): %s", self.responseId, data) + else: + logger.log(logging_level, "Sending Heartbeat (id = %s)", self.responseId) response = self.sendRequest(self.heartbeatUrl, data) exitStatus = 0 @@ -321,8 +328,7 @@ class Controller(threading.Thread): serverId = int(response['responseId']) - - logger.debug('Heartbeat response received (id = %s)', serverId) + logger.log(logging_level, 'Heartbeat response received (id = %s)', serverId) cluster_size = int(response['clusterSize']) if 'clusterSize' in response.keys() else -1 @@ -331,8 +337,7 @@ class Controller(threading.Thread): if cluster_size > 0 \ else self.netutil.HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC - - logger.debug("Heartbeat interval is %s seconds", heartbeat_interval) + logger.log(logging_level, "Heartbeat interval is %s seconds", heartbeat_interval) if 'hasMappedComponents' in response.keys(): self.hasMappedComponents = response['hasMappedComponents'] is not False @@ -364,10 +369,11 @@ class Controller(threading.Thread): else: self.responseId = serverId if send_state: - last_state_timestamp = time.time() + last_state_timestamp = current_time # if the response contains configurations, update the in-memory and # disk-based configuration cache (execution and alert commands have this) + logger.log(logging_level, "Updating configurations from heartbeat") self.cluster_configuration.update_configurations_from_heartbeat(response) response_keys = response.keys() @@ -375,6 +381,8 @@ class Controller(threading.Thread): # there's case when canceled task can be processed in Action Queue.execute before adding rescheduled task to queue # this can cause command failure instead result suppression # so canceling and putting rescheduled commands should be executed atomically + if 'cancelCommands' in response_keys or 'executionCommands' in response_keys: + logger.log(logging_level, "Adding cancel/execution commands") with self.actionQueue.lock: if 'cancelCommands' in response_keys: self.cancelCommandInQueue(response['cancelCommands']) @@ -388,9 +396,10 @@ class Controller(threading.Thread): # try storing execution command details and desired state self.addToStatusQueue(response['statusCommands']) - if crt_time - getrecoverycommands_timestamp > int(getrecoverycommands_interval): - getrecoverycommands_timestamp = crt_time + if current_time - getrecoverycommands_timestamp > getrecoverycommands_interval: + getrecoverycommands_timestamp = current_time if not self.actionQueue.tasks_in_progress_or_pending(): + logger.log(logging_level, "Adding recovery commands") recovery_commands = self.recovery_manager.get_recovery_commands() for recovery_command in recovery_commands: logger.info("Adding recovery command %s for component %s", @@ -398,9 +407,11 @@ class Controller(threading.Thread): self.addToQueue([recovery_command]) if 'alertDefinitionCommands' in response_keys: + logger.log(logging_level, "Updating alert definitions") self.alert_scheduler_handler.update_definitions(response) if 'alertExecutionCommands' in response_keys: + logger.log(logging_level, "Executing alert commands") self.alert_scheduler_handler.execute_alert(response['alertExecutionCommands']) if "true" == response['restartAgent']: @@ -414,6 +425,7 @@ class Controller(threading.Thread): if "recoveryConfig" in response: # update the list of components enabled for recovery + logger.log(logging_level, "Updating recovery config") self.recovery_manager.update_configuration_from_registration(response) retry = False @@ -455,12 +467,15 @@ class Controller(threading.Thread): # Sleep for some time timeout = heartbeat_interval - self.netutil.MINIMUM_INTERVAL_BETWEEN_HEARTBEATS + logger.log(logging_level, "Waiting %s for next heartbeat", timeout) if 0 == self.heartbeat_stop_callback.wait(timeout, self.netutil.MINIMUM_INTERVAL_BETWEEN_HEARTBEATS): # Stop loop when stop event received logger.info("Stop event received") self.DEBUG_STOP_HEARTBEATING=True + logger.log(logging_level, "Wait for next heartbeat over") + def spawnStatusCommandsExecutorProcess(self): ''' Starts a new StatusCommandExecutor child process. In case there is a running instance http://git-wip-us.apache.org/repos/asf/ambari/blob/44f45199/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py b/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py index fbb29f4..2f15770 100644 --- a/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py +++ b/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py @@ -47,6 +47,7 @@ class StatusCommandsExecutor(multiprocessing.Process): def run(self): try: bind_debug_signal_handlers() + logger.info("StatusCommandsExecutor starting") while True: command = self.actionQueue.statusCommandQueue.get(True) # blocks until status status command appears logger.debug("Running status command for {0}".format(command['componentName']))