aboutsummaryrefslogtreecommitdiff
path: root/apps/mariadb/slave-watchdog.nix
diff options
context:
space:
mode:
Diffstat (limited to 'apps/mariadb/slave-watchdog.nix')
-rw-r--r-- apps/mariadb/slave-watchdog.nix 103
1 files changed, 103 insertions, 0 deletions
diff --git a/apps/mariadb/slave-watchdog.nix b/apps/mariadb/slave-watchdog.nix
new file mode 100644
index 0000000..8d1147e
--- /dev/null
+++ b/apps/mariadb/slave-watchdog.nix
@@ -0,0 +1,103 @@
+{ changeMaster, importDump }: ''
+set -euo pipefail  # abort on a failing command, an unset variable, or any failing pipeline stage
+
+ch="$1"  # first positional argument; interpolated into SHOW SLAVE ... STATUS and the mysql.*Slave calls below (presumably the replication connection name - confirm)
+status=$(mktemp)  # scratch file that slave_status overwrites with the status query output on each poll
+trap 'rm -f "$status"' EXIT  # delete the scratch file on every exit path of the script
+
+slave_status () {  # args: $1 = slave name; prints exactly one word on stdout: unknown | ok | sql_error:N | io_error:N | none; details go to stderr
+ if ! mysql -e ';'; then  # empty statement: succeeds only if the client can reach the server
+ echo unknown; return  # server not reachable, so no replication state can be read
+ fi
+
+ if mysql -e "SHOW SLAVE '$1' STATUS\\G" | sed 's,^ *,,' > "$status"; then  # sed strips leading spaces; the result replaces the contents of the global scratch file
+ if grep -oE '\bMaster_Server_Id:\s*[1-9][0-9]*' "$status" >&2; then  # require a non-zero master id; the matched text is logged to stderr. NOTE(review): presumably zero/missing means the slave is not set up - confirm
+ io_errno=$(awk '/Last_IO_Errno:/ {print $2}' "$status")  # second whitespace-separated field of the Last_IO_Errno line
+ sql_errno=$(awk '/Last_SQL_Errno:/ {print $2}' "$status")  # second whitespace-separated field of the Last_SQL_Errno line
+ case "$io_errno:$sql_errno" in  # patterns are tried top to bottom, so their order matters
+ 0:0)  # both error numbers are zero
+ echo ok
+ return
+ ;;
+ 0:*)  # IO error number is zero and the SQL one is not (0:0 was handled above)
+ awk '/Last_SQL_Error:/ {print $0}' "$status" >&2  # log the full SQL error line to stderr
+ echo "sql_error:$sql_errno"
+ return
+ ;;
+ *:*)  # everything else is reported as an IO error, even if an SQL error number is set too
+ awk '/Last_IO_Error:/ {print $0}' "$status" >&2  # log the full IO error line to stderr
+ echo "io_error:$io_errno"
+ return
+ ;;
+ esac
+ fi
+ fi
+ echo none  # reached when the status query pipeline failed or no non-zero Master_Server_Id was found
+}
+
+sql_errors=0  # number of generic sql_error statuses seen in this run; selects the escalation step below
+none_count=0  # number of none statuses seen in this run
+while true; do  # poll slave_status and react; most branches end the script, only none (<10) and generic sql_error loop again
+ st=$(slave_status "$ch")  # captures only stdout; slave_status diagnostics on stderr pass through
+
+ case "$st" in
+ ok|unknown)  # replication reports no error, or the server is unreachable: nothing is repaired in either case
+ echo "status: $st" >&2
+ exit  # bare exit: the script ends with the status of the preceding echo
+ ;;
+ none)  # no usable slave status: wait, then reset and re-import
+ # XXX an existing slave might not be initialized yet right after mariadb restarts, so wait before acting
+ (( ++none_count ))  # pre-increment never evaluates to zero here, so it does not trip set -e
+ echo "status: $st (count: $none_count)" >&2
+ if [ "$none_count" -lt 10 ]; then  # first nine times: sleep one minute and poll again
+ sleep 1m
+ continue  # skips the one-second sleep at the bottom of the loop
+ fi
+ mysql -v -N -e "CALL mysql.resetSlave('$ch')" >&2  # tenth time; resetSlave and the other mysql.*Slave procedures are not defined in this file
+ ${changeMaster} "$ch" | mysql  # changeMaster is filled in by Nix; its stdout is fed to mysql (presumably CHANGE MASTER statements - confirm)
+ if ${importDump} "$ch" | mysql; then  # importDump is filled in by Nix; its stdout is fed to mysql; with pipefail either stage failing selects the else branch
+ mysql -v -N -e "CALL mysql.startSlave('$ch')" >&2
+ exit  # ends with the status of the startSlave call above
+ else
+ echo 'Import failed. Starting over' >&2
+ mysql -v -N -e "CALL mysql.resetSlave('$ch')" >&2
+ exit 1  # NOTE(review): presumably whatever launches this script reruns it after a failure - confirm
+ fi
+ ;;
+ io_error:*)  # stop the slave, feed the changeMaster output to mysql again, start the slave, report failure
+ echo "status: $st" >&2
+ mysql -v -N -e "CALL mysql.stopSlave('$ch')" >&2
+ ${changeMaster} "$ch" | mysql
+ mysql -v -N -e "CALL mysql.startSlave('$ch')" >&2
+ exit 1
+ ;;
+ sql_error:1205) # Lock wait timeout exceeded: only start the slave again, then report failure
+ echo "status: $st" >&2
+ mysql -v -N -e "CALL mysql.startSlave('$ch')" >&2
+ exit 1
+ ;;
+ sql_error:*)  # any other SQL error: escalate one step per occurrence within this run
+ (( ++sql_errors ))  # pre-increment never evaluates to zero here, so it does not trip set -e
+ echo "status: $st (count: $sql_errors)" >&2
+ if [ "$sql_errors" -le 1 ]; then  # first occurrence: pause, wait one second, start
+ mysql -v -N -e "CALL mysql.pauseSlave('$ch')" >&2
+ sleep 1s
+ mysql -v -N -e "CALL mysql.startSlave('$ch')" >&2
+ elif [ "$sql_errors" -le 2 ]; then  # second occurrence: stop, feed the changeMaster output to mysql, start
+ mysql -v -N -e "CALL mysql.stopSlave('$ch')" >&2
+ # unlikely, but this may change replication options (ignored tables, etc.)
+ ${changeMaster} "$ch" | mysql
+ mysql -v -N -e "CALL mysql.startSlave('$ch')" >&2
+ else  # third occurrence: reset the slave and report failure
+ echo '!!! Resetting slave !!!' >&2
+ mysql -v -N -e "CALL mysql.resetSlave('$ch')" >&2
+ exit 1
+ fi
+ sleep 2m  # wait two minutes after a pause/start or stop/start before polling again
+ ;;
+ *) echo "BUG: $st" >&2; exit 255;;  # slave_status printed a word this loop does not handle
+ esac
+ sleep 1s  # reached only from the generic sql_error branch; every other branch exits or continues
+done
+''
+