ambari-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aonis...@apache.org
Subject git commit: AMBARI-5108. HBaseRegionServer requires multiple retries to be stopped during reassigning NameNode after EnablingHA (aonishuk)
Date Tue, 18 Mar 2014 15:06:48 GMT
Repository: ambari
Updated Branches:
  refs/heads/trunk 6609364a0 -> 00fe3df41


AMBARI-5108. HBaseRegionServer requires multiple retries to be stopped
during reassigning NameNode after EnablingHA (aonishuk)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/00fe3df4
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/00fe3df4
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/00fe3df4

Branch: refs/heads/trunk
Commit: 00fe3df41b1f75c063d4a8f84102ec97f4f98559
Parents: 6609364
Author: Andrew Onischuk <aonishuk@hortonworks.com>
Authored: Sun Mar 16 10:59:25 2014 -0700
Committer: Andrew Onischuk <aonishuk@hortonworks.com>
Committed: Sun Mar 16 10:59:25 2014 -0700

----------------------------------------------------------------------
 .../core/providers/system.py                    | 11 +++++++++-
 .../core/resources/system.py                    |  6 ++++++
 .../python/resource_management/core/shell.py    | 21 +++++++++++++-------
 .../HBASE/package/scripts/hbase_service.py      | 21 ++++++++++++--------
 .../stacks/2.0.6/HBASE/test_hbase_master.py     | 16 +++++++++++----
 .../2.0.6/HBASE/test_hbase_regionserver.py      | 16 +++++++++++----
 6 files changed, 67 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-agent/src/main/python/resource_management/core/providers/system.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/resource_management/core/providers/system.py b/ambari-agent/src/main/python/resource_management/core/providers/system.py
index a37ba85..fee24be 100644
--- a/ambari-agent/src/main/python/resource_management/core/providers/system.py
+++ b/ambari-agent/src/main/python/resource_management/core/providers/system.py
@@ -27,6 +27,7 @@ import os
 import pwd
 import time
 import shutil
+from subprocess import TimeoutExpired
 from resource_management.core import shell
 from resource_management.core.base import Fail
 from resource_management.core.providers import Provider
@@ -231,7 +232,7 @@ class ExecuteProvider(Provider):
         shell.checked_call(self.resource.command, logoutput=self.resource.logoutput,
                             cwd=self.resource.cwd, env=self.resource.environment,
                             preexec_fn=_preexec_fn(self.resource), user=self.resource.user,
-                            wait_for_finish=self.resource.wait_for_finish)
+                            wait_for_finish=self.resource.wait_for_finish, timeout=self.resource.timeout)
         break
       except Fail as ex:
         if i == self.resource.tries-1: # last try
@@ -239,6 +240,14 @@ class ExecuteProvider(Provider):
         else:
           Logger.info("Retrying after %d seconds. Reason: %s" % (self.resource.try_sleep, str(ex)))
           time.sleep(self.resource.try_sleep)
+      except TimeoutExpired:
+        err_msg = ("Execution of '%s' was killed due timeout after %d seconds") % (self.resource.command, self.resource.timeout)
+        
+        if self.resource.on_timeout:
+          Logger.info("Executing '%s'. Reason: %s" % (self.resource.on_timeout, err_msg))
+          shell.checked_call(self.resource.on_timeout)
+        else:
+          raise Fail(err_msg)
        
 
 class ExecuteScriptProvider(Provider):

http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-agent/src/main/python/resource_management/core/resources/system.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/resource_management/core/resources/system.py b/ambari-agent/src/main/python/resource_management/core/resources/system.py
index 45d7a60..0952c48 100644
--- a/ambari-agent/src/main/python/resource_management/core/resources/system.py
+++ b/ambari-agent/src/main/python/resource_management/core/resources/system.py
@@ -85,6 +85,12 @@ class Execute(Resource):
   actions = Resource.actions + ["run"]
   logoutput = BooleanArgument(default=False)
   """
+  if on_timeout is not set leads to failing after x seconds,
+  otherwise calls on_timeout
+  """
+  timeout = ResourceArgument() # seconds
+  on_timeout = ResourceArgument()
+  """
   Wait for command to finish or not. 
   
   NOTE:

http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-agent/src/main/python/resource_management/core/shell.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/resource_management/core/shell.py b/ambari-agent/src/main/python/resource_management/core/shell.py
index 68d3f7b..77f2a9c 100644
--- a/ambari-agent/src/main/python/resource_management/core/shell.py
+++ b/ambari-agent/src/main/python/resource_management/core/shell.py
@@ -24,20 +24,21 @@ __all__ = ["checked_call", "call"]
 
 import subprocess
 import pipes
+from subprocess import TimeoutExpired
 from exceptions import Fail
 from resource_management.core.logger import Logger
 
 def checked_call(command, logoutput=False, 
-         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True):
-  return _call(command, logoutput, True, cwd, env, preexec_fn, user, wait_for_finish)
+         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None):
+  return _call(command, logoutput, True, cwd, env, preexec_fn, user, wait_for_finish, timeout)
 
 def call(command, logoutput=False, 
-         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True):
-  return _call(command, logoutput, False, cwd, env, preexec_fn, user, wait_for_finish)
+         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None):
+  return _call(command, logoutput, False, cwd, env, preexec_fn, user, wait_for_finish, timeout)
   
 
 def _call(command, logoutput=False, throw_on_failure=True, 
-         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True):
+         cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None):
   """
   Execute shell command
   
@@ -63,8 +64,14 @@ def _call(command, logoutput=False, throw_on_failure=True,
 
   if not wait_for_finish:
     return None, None
+  
 
-  out = proc.communicate()[0].strip('\n')
+  try:
+    out = proc.communicate(timeout=timeout)[0].strip('\n')
+  except TimeoutExpired as ex:
+    proc.terminate()
+    raise ex
+    
   code = proc.returncode
   
   if logoutput and out:
@@ -74,4 +81,4 @@ def _call(command, logoutput=False, throw_on_failure=True,
     err_msg = ("Execution of '%s' returned %d. %s") % (command[-1], code, out)
     raise Fail(err_msg)
   
-  return code, out
+  return code, out
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py
index 17f0056..d0a6b50 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py
@@ -29,18 +29,23 @@ def hbase_service(
     role = name
     cmd = format("{daemon_script} --config {conf_dir}")
     pid_file = format("{pid_dir}/hbase-{hbase_user}-{role}.pid")
-    
-    daemon_cmd = None
-    no_op_test = None
+    no_op_test = format("ls {pid_file} >/dev/null 2>&1 && ps `cat {pid_file}` >/dev/null 2>&1")
     
     if action == 'start':
       daemon_cmd = format("{cmd} start {role}")
-      no_op_test = format("ls {pid_file} >/dev/null 2>&1 && ps `cat {pid_file}` >/dev/null 2>&1")
-    elif action == 'stop':
-      daemon_cmd = format("{cmd} stop {role} && rm -f {pid_file}")
-
-    if daemon_cmd is not None:
+      
       Execute ( daemon_cmd,
         not_if = no_op_test,
         user = params.hbase_user
       )
+    elif action == 'stop':
+      daemon_cmd = format("{cmd} stop {role}")
+
+      Execute ( daemon_cmd,
+        user = params.hbase_user,
+        # BUGFIX: hbase regionserver sometimes hangs when nn is in safemode
+        timeout = 30,
+        on_timeout = format("{no_op_test} && kill -9 `cat {pid_file}`")
+      )
+      
+      Execute (format("rm -f {pid_file}"))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py
index 1084ed3..2d77e99 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py
@@ -53,9 +53,13 @@ class TestHBaseMaster(RMFTestCase):
                    config_file="default.json"
     )
     
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master && rm -f /var/run/hbase/hbase-hbase-master.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-master.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-master.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-master.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-master.pid',
     )
     self.assertNoMoreResources()
 
@@ -136,9 +140,13 @@ class TestHBaseMaster(RMFTestCase):
                    config_file="secured.json"
     )
 
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master && rm -f /var/run/hbase/hbase-hbase-master.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-master.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-master.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-master.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-master.pid',
     )
     self.assertNoMoreResources()
 

http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
index 4ced781..920312a 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
@@ -53,9 +53,13 @@ class TestHbaseRegionServer(RMFTestCase):
                    config_file="default.json"
     )
     
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver && rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-regionserver.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-regionserver.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
     )
     self.assertNoMoreResources()
     
@@ -90,9 +94,13 @@ class TestHbaseRegionServer(RMFTestCase):
                    config_file="secured.json"
     )
 
-    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver && rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
-      not_if = None,
+    self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver',
       user = 'hbase',
+      on_timeout = 'ls /var/run/hbase/hbase-hbase-regionserver.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-regionserver.pid`', 
+      timeout = 30,
+    )
+    
+    self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-regionserver.pid',
     )
     self.assertNoMoreResources()
 


Mime
View raw message