impala-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From taras...@apache.org
Subject [06/18] incubator-impala git commit: IMPALA-3677: Write minidump on SIGUSR1
Date Thu, 14 Jul 2016 19:05:03 GMT
IMPALA-3677: Write minidump on SIGUSR1

Sending SIGUSR1 to any of the impala daemons (catalogd, impalad,
statestored) will trigger a minidump write.

The hotspot JVM also uses SIGUSR1 internally. However the documentation
explains, that existing signal handlers will be transparently wrapped by
the JVM and no spurious signals should be received by the daemon signal
handler:
http://www.oracle.com/technetwork/java/javase/signals-139944.html

Example: killall -SIGUSR1 catalogd

Change-Id: I40149e48e391451de21a5c8bda18e2307fc89513
Reviewed-on: http://gerrit.cloudera.org:8080/3312
Reviewed-by: Lars Volker <lv@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/948a6c34
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/948a6c34
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/948a6c34

Branch: refs/heads/master
Commit: 948a6c34fcaa7ba481254e0a7b0d2cf987a4b690
Parents: db4294c
Author: Lars Volker <lv@cloudera.com>
Authored: Sat Jun 4 22:18:04 2016 +0200
Committer: Taras Bobrovytsky <tarasbob@apache.org>
Committed: Thu Jul 14 19:04:44 2016 +0000

----------------------------------------------------------------------
 be/src/util/minidump.cc               | 29 ++++++++++++++--
 tests/custom_cluster/test_breakpad.py | 55 ++++++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/948a6c34/be/src/util/minidump.cc
----------------------------------------------------------------------
diff --git a/be/src/util/minidump.cc b/be/src/util/minidump.cc
index 3a7577f..f516a56 100644
--- a/be/src/util/minidump.cc
+++ b/be/src/util/minidump.cc
@@ -21,10 +21,11 @@
 #include <google_breakpad/common/minidump_format.h>
 #include <third_party/lss/linux_syscall_support.h>
 #include <ctime>
+#include <fstream>
 #include <glob.h>
 #include <iomanip>
-#include <fstream>
 #include <map>
+#include <signal.h>
 
 #include "common/logging.h"
 #include "common/version.h"
@@ -48,6 +49,10 @@ DECLARE_int32(minidump_size_limit_hint_kb);
 
 namespace impala {
 
+/// Breakpad ExceptionHandler. It registers its own signal handlers to write minidump
+/// files during process crashes, but also can be used to write minidumps directly.
+static google_breakpad::ExceptionHandler* minidump_exception_handler = NULL;
+
 /// Callback for breakpad. It is called by breakpad whenever a minidump file has been
 /// written and should not be called directly. It logs the event before breakpad crashes
 /// the process. Due to the process being in a failed state we write to stdout/stderr and
@@ -80,6 +85,21 @@ static bool DumpCallback(const google_breakpad::MinidumpDescriptor&
descriptor,
   return succeeded;
 }
 
+/// Signal handler to write a minidump file outside of crashes.
+static void HandleSignal(int signal) {
+  minidump_exception_handler->WriteMinidump(FLAGS_minidump_path, DumpCallback, NULL);
+}
+
+/// Register our signal handler to write minidumps on SIGUSR1.
+static void SetupSignalHandler() {
+  DCHECK(minidump_exception_handler != NULL);
+  struct sigaction sig_action;
+  memset(&sig_action, 0, sizeof(sig_action));
+  sigemptyset(&sig_action.sa_mask);
+  sig_action.sa_handler = &HandleSignal;
+  sigaction(SIGUSR1, &sig_action, NULL);
+}
+
 /// Check the number of minidump files and removes the oldest ones to maintain an upper
 /// bound on the number of files.
 static void CheckAndRemoveMinidumps(int max_minidumps) {
@@ -201,9 +221,12 @@ Status RegisterMinidump(const char* cmd_line_path) {
   }
 
   // Intentionally leaked. We want this to have the lifetime of the process.
-  google_breakpad::ExceptionHandler* eh =
+  DCHECK(minidump_exception_handler == NULL);
+  minidump_exception_handler =
       new google_breakpad::ExceptionHandler(desc, NULL, DumpCallback, NULL, true, -1);
-  (void)eh;
+
+  // Setup signal handler for SIGUSR1.
+  SetupSignalHandler();
 
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/948a6c34/tests/custom_cluster/test_breakpad.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_breakpad.py b/tests/custom_cluster/test_breakpad.py
index 84cbbe1..91a5f9f 100644
--- a/tests/custom_cluster/test_breakpad.py
+++ b/tests/custom_cluster/test_breakpad.py
@@ -21,7 +21,7 @@ import tempfile
 import time
 
 from resource import setrlimit, RLIMIT_CORE, RLIM_INFINITY
-from signal import SIGSEGV, SIGKILL
+from signal import SIGKILL, SIGSEGV, SIGUSR1
 from tests.common.skip import SkipIfBuildType
 from subprocess import CalledProcessError
 
@@ -85,12 +85,12 @@ class TestBreakpad(CustomClusterTestSuite):
     processes = self.cluster.impalads + [self.cluster.catalogd, self.cluster.statestored]
     processes = filter(None, processes)
     self.kill_processes(processes, signal)
-    self.assert_all_processes_killed()
+    signal is SIGUSR1 or self.assert_all_processes_killed()
 
   def kill_processes(self, processes, signal):
     for process in processes:
       process.kill(signal)
-    self.wait_for_all_processes_dead(processes)
+    signal is SIGUSR1 or self.wait_for_all_processes_dead(processes)
 
   def wait_for_all_processes_dead(self, processes, timeout=300):
     for process in processes:
@@ -104,6 +104,25 @@ class TestBreakpad(CustomClusterTestSuite):
         raise RuntimeError("Unable to kill %s (pid %d) after %d seconds." %
             (psutil_process.name, psutil_process.pid, timeout))
 
+  def get_num_processes(self, daemon):
+    self.cluster.refresh()
+    if daemon == 'impalad':
+      return len(self.cluster.impalads)
+    elif daemon == 'catalogd':
+      return self.cluster.catalogd and 1 or 0
+    elif daemon == 'statestored':
+      return self.cluster.statestored and 1 or 0
+    raise RuntimeError("Unknown daemon name: %s" % daemon)
+
+  def wait_for_num_processes(self, daemon, num_expected, timeout=60):
+    end = time.time() + timeout
+    self.cluster.refresh()
+    num_processes = self.get_num_processes(daemon)
+    while num_processes != num_expected and time.time() <= end:
+      time.sleep(1)
+      num_processes = self.get_num_processes(daemon)
+    return num_processes
+
   def assert_all_processes_killed(self):
     self.cluster.refresh()
     assert not self.cluster.impalads
@@ -130,7 +149,7 @@ class TestBreakpad(CustomClusterTestSuite):
     assert self.count_all_minidumps() == 0
     self.start_cluster()
     assert self.count_all_minidumps() == 0
-    cluster_size = len(self.cluster.impalads)
+    cluster_size = self.get_num_processes('impalad')
     self.kill_cluster(SIGSEGV)
     self.assert_num_logfile_entries(1)
     assert self.count_minidumps('impalad') == cluster_size
@@ -138,6 +157,28 @@ class TestBreakpad(CustomClusterTestSuite):
     assert self.count_minidumps('catalogd') == 1
 
   @pytest.mark.execute_serially
+  def test_sigusr1_writes_minidump(self):
+    """Check that when a daemon receives SIGUSR1 it writes a minidump file."""
+    assert self.count_all_minidumps() == 0
+    self.start_cluster()
+    assert self.count_all_minidumps() == 0
+    cluster_size = self.get_num_processes('impalad')
+    self.kill_cluster(SIGUSR1)
+    # Breakpad forks to write its minidump files, wait for all the clones to terminate.
+    assert self.wait_for_num_processes('impalad', cluster_size, 5) == cluster_size
+    assert self.wait_for_num_processes('catalogd', 1, 5) == 1
+    assert self.wait_for_num_processes('statestored', 1, 5) == 1
+    # Make sure impalad still answers queries.
+    client = self.create_impala_client()
+    self.execute_query_expect_success(client, "SELECT COUNT(*) FROM functional.alltypes")
+    # Kill the cluster. Sending SIGKILL will not trigger minidumps to be written.
+    self.kill_cluster(SIGKILL)
+    self.assert_num_logfile_entries(1)
+    assert self.count_minidumps('impalad') == cluster_size
+    assert self.count_minidumps('statestored') == 1
+    assert self.count_minidumps('catalogd') == 1
+
+  @pytest.mark.execute_serially
   def test_minidump_relative_path(self):
     """Check that setting 'minidump_path' to a relative value results in minidump files
     written to 'log_dir'.
@@ -148,7 +189,7 @@ class TestBreakpad(CustomClusterTestSuite):
     # configuration, which is a FLAGS_log_dir/minidumps.
     self.start_cluster_with_args()
     assert self.count_all_minidumps(minidump_base_dir) == 0
-    cluster_size = len(self.cluster.impalads)
+    cluster_size = self.get_num_processes('impalad')
     self.kill_cluster(SIGSEGV)
     self.assert_num_logfile_entries(1)
     assert self.count_minidumps('impalad', minidump_base_dir) == cluster_size
@@ -164,7 +205,7 @@ class TestBreakpad(CustomClusterTestSuite):
     self.kill_cluster(SIGSEGV)
     self.assert_num_logfile_entries(1)
     self.start_cluster()
-    expected_impalads = min(len(self.cluster.impalads), 2)
+    expected_impalads = min(self.get_num_processes('impalad'), 2)
     assert self.count_minidumps('impalad') == expected_impalads
     assert self.count_minidumps('statestored') == 1
     assert self.count_minidumps('catalogd') == 1
@@ -183,7 +224,7 @@ class TestBreakpad(CustomClusterTestSuite):
     the cluster. Clean up the single minidump file and return its size.
     """
     self.cluster.refresh()
-    assert len(self.cluster.impalads) > 0
+    assert self.get_num_processes('impalad') > 0
     # Make one impalad write a minidump.
     self.kill_processes(self.cluster.impalads[:1], SIGSEGV)
     # Kill the rest of the cluster.


Mime
View raw message