Last active
August 29, 2015 14:07
-
-
Save hoylen/8bf4eb2e64ddbb8bdcf6 to your computer and use it in GitHub Desktop.
Init script to detect lost default network route and to restart network services to fix it. This script is useful when an unknown problem causes the default route to be lost (e.g. certain deployments of CentOS 7 and Fedora 20). Copy to /etc/init.d/route-repair and run "chkconfig route-repair on" and "service route-repair start".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# chkconfig: 2345 15 85
# description: Maintains default route by restarting network
#
# This script (when run with the "run-loop" argument) checks if the
# default route is still present, and if not the network service is
# restarted. The timestamp of when the failure was detected is
# logged.
#
# This script is also an init.d script that can be used to
# start the monitor process (i.e. it runs itself with the "run-loop"
# argument). To install and run:
#
# # cp route-repair.sh /etc/init.d/route-repair
# # chkconfig route-repair on
# # service route-repair start
# # service route-repair status
#
# Detected failures are written to the log file in /var/log/.
#
# This is a workaround for the problem with CentOS 7 and Fedora 20
# running on the "QRIScloud" availability zone.
#
# Copyright (C) 2014, QCIF Ltd.
#----------------------------------------------------------------
# Seconds between checking
DELAY_BETWEEN_CHECKS=5

# Maximum delay between retrying repair if it failed
REPAIR_DELAY_MAX=$((5 * 60))

# Where to store the PID and log. Both file names are derived from the
# name this script was invoked as, with any ".sh" suffix removed.
PID_FILE="/var/run/$(basename "$0" .sh).pid"
LOG_FILE="/var/log/$(basename "$0" .sh).log"

# Program name, used as the prefix of user-facing messages
PROG="$(basename "$0")"
#---------------------------------------------------------------- | |
# Test whether the routing table currently has a default route.
#
# Looks for a line beginning with "default via" in the output of
# "ip route show".
#
# Returns: 0 if such a line is present, 1 otherwise (including when the
#          pipeline itself fails).
has_default_route () {
  if ip route show | grep -q '^default via'; then
    return 0
  fi
  return 1 # Default route is missing
}
# Check for the default route and, if it is missing, restart the network
# service repeatedly until the route is back.
#
# The wait between failed attempts starts at 1 second and doubles after
# each failure, capped at REPAIR_DELAY_MAX seconds. The function does not
# give up: it only returns once has_default_route succeeds.
#
# Globals:  REPAIR_DELAY_MAX (read); WHEN and REPAIR_DELAY (written, used
#           as scratch variables)
# Outputs:  timestamped log lines on stdout:
#             - "default route lost detected" when the loss is noticed
#             - "repair failed" after the first unsuccessful restart only
#             - "repair successful" only if an earlier attempt had failed
#           The timestamp on the repair lines is taken just before the
#           restart attempt they refer to.
# Returns:  0
check_and_fix () {
  if has_default_route; then
    return 0 # Nothing to do
  fi

  # Default route is missing: log it
  WHEN=$(date "+%F %T")
  echo "$WHEN: default route lost detected"

  # Attempt to repair it. REPAIR_DELAY doubles as the "is this the first
  # attempt?" flag: it is 1 only until the first failure.
  REPAIR_DELAY=1
  while true; do
    WHEN=$(date "+%F %T")

    # Restart network service
    systemctl restart network.service

    if has_default_route; then
      # Repair successful; stay quiet if it worked first time
      if [ "$REPAIR_DELAY" -ne 1 ]; then
        echo "$WHEN: repair successful"
      fi
      return 0 # success
    fi

    # Repair failed: report the first failure only, then back off
    if [ "$REPAIR_DELAY" -eq 1 ]; then
      echo "$WHEN: repair failed"
    fi
    sleep "$REPAIR_DELAY"

    REPAIR_DELAY=$((REPAIR_DELAY * 2))
    if [ "$REPAIR_DELAY" -gt "$REPAIR_DELAY_MAX" ]; then
      REPAIR_DELAY=$REPAIR_DELAY_MAX
    fi
  done
}
#---------------------------------------------------------------- | |
# Command selection: exactly one argument is expected; anything else
# (no arguments, or more than one) shows the usage message.
if [ "$#" -eq 1 ]; then
  COMMAND=$1
else
  COMMAND=help
fi

# Command dispatch. Uses PID_FILE, LOG_FILE, PROG and DELAY_BETWEEN_CHECKS
# and the check_and_fix function defined earlier in this file.
case "$COMMAND" in
  start)
    # Refuse to start only if the recorded process is really alive. A PID
    # file whose process no longer exists is stale and is removed, so a
    # monitor that died can be started again without a manual "stop".
    if [ -f "$PID_FILE" ]; then
      PID=$(cat "$PID_FILE")
      if [ -n "$PID" ] && kill -0 "$PID" > /dev/null 2>&1; then
        echo "$PROG: error: already running" >&2
        exit 1
      fi
      rm -f "$PID_FILE"
    fi
    # Run this same script as the background monitor, logging to LOG_FILE
    "$0" run-loop >> "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    ;;

  stop)
    if [ -f "$PID_FILE" ]; then
      PID=$(cat "$PID_FILE")
      if [ -n "$PID" ] && kill -0 "$PID" > /dev/null 2>&1; then
        # Keep the PID file if the process could not be signalled
        kill "$PID" && rm "$PID_FILE"
      else
        # Process is already gone: just clean up the stale PID file
        rm "$PID_FILE"
      fi
    fi
    ;;

  restart)
    "$0" stop
    "$0" start
    ;;

  status)
    # Exit status: 0 if running, 1 otherwise
    if [ ! -f "$PID_FILE" ]; then
      echo "$PROG: stopped"
      exit 1
    fi
    PID=$(cat "$PID_FILE")
    # Probe the process once and keep the error text to classify a failure
    if KILL_ERR=$(kill -0 "$PID" 2>&1); then
      echo "$PROG: running"
      exit 0
    fi
    case "$KILL_ERR" in
      *'No such process'*)
        echo "$PROG: stopped (process died)"
        ;;
      *'Operation not permitted'*)
        echo "$PROG: insufficient permission: possibly running"
        ;;
      *)
        echo "$PROG: possibly stopped"
        ;;
    esac
    exit 1
    ;;

  run-once)
    check_and_fix
    ;;

  run-loop)
    # Monitor forever; this is what "start" runs in the background
    while true; do
      check_and_fix
      sleep "$DELAY_BETWEEN_CHECKS"
    done
    ;;

  help|--help|-h)
    echo "Usage: $PROG run-once|run-loop|start|stop|restart|status|help"
    ;;

  *)
    echo "Error: unknown command: $COMMAND (--help for help)" >&2
    exit 1
    ;;
esac
#EOF |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment