Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 23A5A200C30 for ; Tue, 7 Mar 2017 12:11:11 +0100 (CET) Received: by cust-asf.ponee.io (Postfix) id 22333160B82; Tue, 7 Mar 2017 11:11:11 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 522EF160B68 for ; Tue, 7 Mar 2017 12:11:10 +0100 (CET) Received: (qmail 65955 invoked by uid 500); 7 Mar 2017 11:11:09 -0000 Mailing-List: contact commits-help@ambari.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ambari-dev@ambari.apache.org Delivered-To: mailing list commits@ambari.apache.org Received: (qmail 65945 invoked by uid 99); 7 Mar 2017 11:11:09 -0000 Received: from git1-us-west.apache.org (HELO git1-us-west.apache.org) (140.211.11.23) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 07 Mar 2017 11:11:09 +0000 Received: by git1-us-west.apache.org (ASF Mail Server at git1-us-west.apache.org, from userid 33) id 71C32DFD9E; Tue, 7 Mar 2017 11:11:09 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: lpuskas@apache.org To: commits@ambari.apache.org Message-Id: <802d132ecb014b9fa3f28de550ca81a0@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: ambari git commit: AMBARI-20319 Server startup script keeps waiting even if DB consistency has failed Date: Tue, 7 Mar 2017 11:11:09 +0000 (UTC) archived-at: Tue, 07 Mar 2017 11:11:11 -0000 Repository: ambari Updated Branches: refs/heads/trunk e6dcdf633 -> c21f77dfb AMBARI-20319 Server startup script keeps waiting even if DB consistency has failed Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: 
http://git-wip-us.apache.org/repos/asf/ambari/commit/c21f77df Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/c21f77df Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/c21f77df Branch: refs/heads/trunk Commit: c21f77dfb6ca4fdd012af4313696eabcc11c1fbc Parents: e6dcdf6 Author: Balazs Bence Sari Authored: Tue Mar 7 11:30:12 2017 +0100 Committer: lpuskas Committed: Tue Mar 7 12:04:04 2017 +0100 ---------------------------------------------------------------------- .../ambari/server/controller/AmbariServer.java | 4 +++ .../src/main/python/ambari_server/utils.py | 14 +++++++--- .../src/main/python/ambari_server_main.py | 28 +++++++++++++------- 3 files changed, 32 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/c21f77df/ambari-server/src/main/java/org/apache/ambari/server/controller/AmbariServer.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/controller/AmbariServer.java b/ambari-server/src/main/java/org/apache/ambari/server/controller/AmbariServer.java index 9540ca3..a2441bd 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/controller/AmbariServer.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/controller/AmbariServer.java @@ -1072,6 +1072,10 @@ public class AmbariServer { ComponentSSLConfiguration.instance().init(server.configs); server.run(); } catch (Throwable t) { + // Writing to system console is needed because loggers may not get flushed on exit and diagnostic information + // may get lost. 
+ System.err.println("An unexpected error occured during starting Ambari Server."); + t.printStackTrace(); LOG.error("Failed to run the Ambari Server", t); if (server != null) { server.stop(); http://git-wip-us.apache.org/repos/asf/ambari/blob/c21f77df/ambari-server/src/main/python/ambari_server/utils.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/python/ambari_server/utils.py b/ambari-server/src/main/python/ambari_server/utils.py index 6408285..b51e37e 100644 --- a/ambari-server/src/main/python/ambari_server/utils.py +++ b/ambari-server/src/main/python/ambari_server/utils.py @@ -122,7 +122,10 @@ def save_pid(pid, pidfile): def save_main_pid_ex(pids, pidfile, exclude_list=[], skip_daemonize=False): """ - Save pid which is not included to exclude_list to pidfile. + Saves and returns the first (and supposedly only) pid from the list of pids + which is not included in the exclude_list. + + pidfile is the name of the file to save the pid to exclude_list contains list of full executable paths which should be excluded """ @@ -133,7 +136,7 @@ def save_main_pid_ex(pids, pidfile, exclude_list=[], skip_daemonize=False): for item in pids: if pid_exists(item["pid"]) and (item["exe"] not in exclude_list): pfile.write("%s\n" % item["pid"]) - pid_saved = True + pid_saved = item["pid"] logger.info("Ambari server started with PID " + str(item["pid"])) if pid_exists(item["pid"]) and (item["exe"] in exclude_list) and not skip_daemonize: try: @@ -157,7 +160,7 @@ def get_live_pids_count(pids): """ return len([pid for pid in pids if pid_exists(pid)]) -def wait_for_ui_start(ambari_server_ui_port, timeout=1): +def wait_for_ui_start(ambari_server_ui_port, pid, timeout=1): tstart = time.time() while int(time.time()-tstart) <= timeout: @@ -173,7 +176,10 @@ def wait_for_ui_start(ambari_server_ui_port, timeout=1): sys.stdout.write('.') sys.stdout.flush() - time.sleep(1) + if pid_exists(pid): + time.sleep(1) + else: + break return 
False http://git-wip-us.apache.org/repos/asf/ambari/blob/c21f77df/ambari-server/src/main/python/ambari_server_main.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/python/ambari_server_main.py b/ambari-server/src/main/python/ambari_server_main.py index 0cd19cc..0eb4243 100644 --- a/ambari-server/src/main/python/ambari_server_main.py +++ b/ambari-server/src/main/python/ambari_server_main.py @@ -21,6 +21,7 @@ import os import subprocess import sys import logging +import time from ambari_commons.exceptions import FatalException from ambari_commons.logging_utils import get_debug_mode, print_warning_msg, print_info_msg, set_debug_mode_from_options @@ -115,6 +116,8 @@ CHECK_DATABASE_SKIPPED_PROPERTY = "check_database_skipped" AMBARI_SERVER_DIE_MSG = "Ambari Server java process died with exitcode {0}. Check {1} for more information." AMBARI_SERVER_NOT_STARTED_MSG = "Ambari Server java process hasn't been started or can't be determined." +AMBARI_SERVER_STOPPED = "Ambari Server java process has stopped. Please check the logs for more information." +AMBARI_SERVER_UI_TIMEOUT = "Server not yet listening on http port {0} after {1} seconds. Exiting." 
AMBARI_SERVER_STARTED_SUCCESS_MSG = "Ambari Server has started successfully" # linux open-file limit @@ -211,27 +214,32 @@ def wait_for_server_start(pidFile, scmStatus): sys.stdout.write('Waiting for server start...') sys.stdout.flush() pids = [] - server_started = False + pid = None # looking_for_pid() might return partrial pid list on slow hardware for i in range(1, SERVER_START_RETRIES): pids = looking_for_pid(SERVER_SEARCH_PATTERN, SERVER_START_TIMEOUT) - if save_main_pid_ex(pids, pidFile, locate_all_file_paths('sh', '/bin') + - locate_all_file_paths('bash', '/bin') + - locate_all_file_paths('dash', '/bin'), IS_FOREGROUND): - server_started = True + pid = save_main_pid_ex(pids, pidFile, locate_all_file_paths('sh', '/bin') + + locate_all_file_paths('bash', '/bin') + + locate_all_file_paths('dash', '/bin'), IS_FOREGROUND) + if pid: break else: sys.stdout.write("Unable to determine server PID. Retrying...\n") sys.stdout.flush() exception = None - if server_started: + if pid: ambari_server_ui_port = get_ambari_server_ui_port(properties) web_server_startup_timeout = get_web_server_startup_timeout(properties) - - if not wait_for_ui_start(int(ambari_server_ui_port), web_server_startup_timeout): - exception = FatalException(1, "Server not yet listening on http port " + ambari_server_ui_port + \ - " after " + str(web_server_startup_timeout) + " seconds. 
Exiting.") + waitStart = time.time() + if not wait_for_ui_start(int(ambari_server_ui_port), pid, web_server_startup_timeout): + waitTime = int(time.time()-waitStart) + # Java process stopped, due to a DB check or other startup issue + if waitTime < web_server_startup_timeout: + exception = FatalException(-1, AMBARI_SERVER_STOPPED) + # UI didn't come up on time + else: + exception = FatalException(1, AMBARI_SERVER_UI_TIMEOUT.format(ambari_server_ui_port, web_server_startup_timeout)) elif get_live_pids_count(pids) <= 0: exitcode = check_exitcode(os.path.join(configDefaults.PID_DIR, EXITCODE_NAME)) exception = FatalException(-1, AMBARI_SERVER_DIE_MSG.format(exitcode, configDefaults.SERVER_OUT_FILE))