accumulo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From vi...@apache.org
Subject [1/2] accumulo git commit: ACCUMULO-3569 initial pass at integrating auto-restarts
Date Tue, 07 Apr 2015 15:25:33 GMT
Repository: accumulo
Updated Branches:
  refs/heads/master edc080c83 -> 998a31cdc


ACCUMULO-3569 initial pass at integrating auto-restarts

Updating methods to be more flexible

Needed a few more tweaks

Fixing to work on all processes

Updating for logging changes


Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/05853827
Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/05853827
Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/05853827

Branch: refs/heads/master
Commit: 058538270223c2fe6f82e8c3a39baac73ede1669
Parents: edc080c
Author: John Vines <vines@apache.org>
Authored: Fri Feb 6 15:58:27 2015 -0500
Committer: John Vines <vines@apache.org>
Committed: Tue Apr 7 11:17:10 2015 -0400

----------------------------------------------------------------------
 assemble/bin/accumulo_watcher.sh        | 127 +++++++++++++++++++++++++++
 assemble/bin/start-server.sh            |   9 +-
 assemble/conf/templates/accumulo-env.sh |  14 +++
 3 files changed, 148 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/accumulo/blob/05853827/assemble/bin/accumulo_watcher.sh
----------------------------------------------------------------------
diff --git a/assemble/bin/accumulo_watcher.sh b/assemble/bin/accumulo_watcher.sh
new file mode 100755
index 0000000..9a3dc9f
--- /dev/null
+++ b/assemble/bin/accumulo_watcher.sh
@@ -0,0 +1,127 @@
+#! /usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LOGHOST=$1
+shift
+process=$1
+
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+   bin="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+   SOURCE="$(readlink "$SOURCE")"
+   [[ $SOURCE != /* ]] && SOURCE="$bin/$SOURCE" # if $SOURCE was a relative symlink,
we need to resolve it relative to the path where the symlink file was located
+done
+bin="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+# Stop: Resolve Script Directory
+
+. "${bin}"/config.sh
+
+ERRFILE=${ACCUMULO_LOG_DIR}/${process}_${LOGHOST}.err
+OUTFILE=${ACCUMULO_LOG_DIR}/${process}_${LOGHOST}.out
+DEBUGLOG=${ACCUMULO_LOG_DIR}/${process}_$(hostname).debug.log
+export COMMAND="${bin}/accumulo \"\$@\""
+
+logger -s "starting process $process at $(date)"
+stopRunning=""
+while [ -z "$stopRunning" ];
+do
+  eval $COMMAND 2> $ERRFILE
+  exit=$?
+  unset cause
+  if [ "$exit" -eq 0 ]; then
+    potentialStopRunning="Clean Exit"
+  elif [ "$exit" -eq 1 ]; then
+    potentialStopRunning="Unexpected error"
+  elif [ "$exit" -eq 130 ]; then
+    stopRunning="Control C detected, exiting"
+  elif [ "$exit" -eq 143 ]; then
+    stopRunning="Process terminated, exiting"
+  elif [ "$exit" -eq 137 ]; then
+    potentialStopRunning="Process killed, exiting"
+  fi
+  if [ -z "$stopRunning" ]; then
+    stopRunning=$potentialStopRunning;
+
+    if [ $exit -eq 1 ]; then
+      source="exit code"
+      cause="Unexpected Exception"
+    elif tail -n50 $OUTFILE | grep "java.lang.OutOfMemoryError:" > /dev/null; then
+      source="logs"
+      cause="Out of memory exception"
+    elif [ "$process" = "tserver" ]; then
+      if tail -n50 $DEBUGLOG | grep "ERROR: Lost tablet server lock (reason =" > /dev/null
; then
+        source="logs"
+        cause="ZKLock lost"
+      fi
+    elif [ "$process" = "master" ]; then
+      if tail -n50 $DEBUGLOG | grep "ERROR: Master lock in zookeeper lost (reason =" >
/dev/null ; then
+        source="logs"
+        cause="ZKLock lost"
+      fi
+    elif [ "$process" = "gc" ]; then
+      if tail -n50 $DEBUGLOG | grep "FATAL: GC lock in zookeeper lost (reason =" > /dev/null
; then
+        source="logs"
+        cause="ZKLock lost"
+      fi
+    elif [ "$process" = "monitor" ]; then
+      if tail -n50 $DEBUGLOG | grep "ERROR:  Monitor lock in zookeeper lost (reason =" >
/dev/null ; then
+        source="logs"
+        cause="ZKLock lost"
+      fi
+    elif [ $exit -ne 0 ]; then
+      source="exit code"
+      cause="Unknown error"
+    fi
+    case $cause in
+      #Unknown exit code
+      "Unknown error")
+        #window doesn't matter when retries = 0
+        RETRIES=0
+        ;;
+
+      "Unexpected Exception")
+        WINDOW=$UNEXPECTED_TIMESPAN
+        RETRIES=$UNEXPECTED_RETRIES
+        ;;
+
+      "Out of memory exception") 
+        WINDOW=$OOM_TIMESPAN
+        RETRIES=$OOM_RETRIES
+        ;;
+
+      "ZKLock lost")
+        WINDOW=$ZKLOCK_TIMESPAN
+        RETRIES=$ZKLOCK_RETRIES
+        ;;
+    esac
+
+    if [ -n "$cause" ]; then
+      stopRunning=""
+      declare -i attempts
+      attempts="`jobs | grep "reason$cause" | wc -l`+1"
+      if [ "$RETRIES" -le $attempts ]; then
+        stopRunning="$process encountered $cause in $source with exit code $exit- quitting
($attempts/$RETRIES in $WINDOW seconds)"
+        # kill all sleeps now
+        for list in `jobs | cut -b 2-2`; do kill %$list; done
+      else
+        logger -s "$process encountered $cause in $source with exit code $exit- retrying
($attempts/$RETRIES in $WINDOW seconds)"
+        eval "(sleep $WINDOW ; echo "reason$cause" >> /dev/null) &" 
+      fi
+    fi 
+  fi
+done
+logger -s $stopRunning

http://git-wip-us.apache.org/repos/asf/accumulo/blob/05853827/assemble/bin/start-server.sh
----------------------------------------------------------------------
diff --git a/assemble/bin/start-server.sh b/assemble/bin/start-server.sh
index 1ed73de..c52bd64 100755
--- a/assemble/bin/start-server.sh
+++ b/assemble/bin/start-server.sh
@@ -72,11 +72,16 @@ fi
 
 if [[ -z "$PID" ]]; then
    echo "Starting $LONGNAME on $HOST"
+   COMMAND="${bin}/accumulo"
+   if [ "${ACCUMULO_WATCHER}" = "true" ]; then
+      COMMAND="${bin}/accumulo_watcher.sh ${LOGHOST}"
+   fi
+
    if [[ $HOST == localhost || $HOST == "$(hostname -f)" || $HOST = "$IP" ]]; then
-      "${bin}/accumulo" "${SERVICE}" --address "${ADDRESS}" >"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out"
2>"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err" & 
+      "$COMMAND" "${SERVICE}" --address "${ADDRESS}" >"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out"
2>"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err" & 
       MAX_FILES_OPEN=$(ulimit -n)
    else
-      $SSH "$HOST" "bash -c 'exec nohup ${bin}/accumulo ${SERVICE} --address ${ADDRESS} >${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out
2>${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err' &"
+      $SSH "$HOST" "bash -c 'exec nohup $COMMAND ${SERVICE} --address ${ADDRESS} >${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out
2>${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err' &"
       MAX_FILES_OPEN=$($SSH "$HOST" "/usr/bin/env bash -c 'ulimit -n'") 
    fi
 

http://git-wip-us.apache.org/repos/asf/accumulo/blob/05853827/assemble/conf/templates/accumulo-env.sh
----------------------------------------------------------------------
diff --git a/assemble/conf/templates/accumulo-env.sh b/assemble/conf/templates/accumulo-env.sh
index a21702b..5be96c0 100644
--- a/assemble/conf/templates/accumulo-env.sh
+++ b/assemble/conf/templates/accumulo-env.sh
@@ -61,3 +61,17 @@ export ACCUMULO_KILL_CMD='kill -9 %p'
 
 # Should the monitor bind to all network interfaces -- default: false
 # export ACCUMULO_MONITOR_BIND_ALL="true"
+
+# Should process be automatically restarted
+# export ACCUMULO_WATCHER="true"
+
+# What settings should we use for the watcher, if enabled
+export UNEXPECTED_TIMESPAN="3600"
+export UNEXPECTED_RETRIES="2"
+
+export OOM_TIMESPAN="3600"
+export OOM_RETRIES="5"
+
+export ZKLOCK_TIMESPAN="600"
+export ZKLOCK_RETRIES="5"
+


Mime
View raw message