aboutsummaryrefslogtreecommitdiff
path: root/apps/mariadb/slave-watchdog.nix
blob: 8d1147e942e5f6dbd2e77e88d30bdd3f9adb0404 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
{ changeMaster, importDump }: ''
# Strict mode: abort on unhandled command failure, on use of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# First positional argument: the name handed to SHOW SLAVE '<name>' STATUS and
# to the mysql.*Slave procedures further down.
ch=$1

# Scratch file that receives the latest SHOW SLAVE STATUS output; it is removed
# again whenever the script exits.
status=$(mktemp)
remove_status_file () {
  rm -f "$status"
}
trap remove_status_file EXIT

# slave_status NAME
#
# Classifies the replication state reported by SHOW SLAVE 'NAME' STATUS and
# prints exactly one word on stdout:
#   unknown         - the trivial probe query failed (server not reachable)
#   none            - the status query failed, or it reported no non-zero
#                     Master_Server_Id
#   ok              - Last_IO_Errno and Last_SQL_Errno are both 0
#   sql_error:<n>   - Last_IO_Errno is 0 and Last_SQL_Errno is <n>
#   io_error:<n>    - anything else; <n> is Last_IO_Errno
# Diagnostics (the matched Master_Server_Id and the relevant Last_*_Error line)
# go to stderr. The raw status output, with leading blanks stripped, is left in
# the file named by the global $status.
slave_status () {
  local io_code sql_code

  # Probe with an empty statement first so an unreachable server is not
  # mistaken for a missing slave configuration.
  mysql -e ';' || { echo unknown; return; }

  # Vertical (\G) output, one "Field: value" pair per line.
  if ! mysql -e "SHOW SLAVE '$1' STATUS\\G" | sed 's,^ *,,' > "$status"; then
    echo none
    return
  fi

  # A configured slave reports a non-zero master server id; the match itself
  # is echoed to stderr as part of the log.
  if ! grep -oE '\bMaster_Server_Id:\s*[1-9][0-9]*' "$status" >&2; then
    echo none
    return
  fi

  io_code=$(awk '/Last_IO_Errno:/ {print $2}' "$status")
  sql_code=$(awk '/Last_SQL_Errno:/ {print $2}' "$status")

  # The IO error takes precedence: the SQL error is only reported when the IO
  # errno is exactly 0.
  case "$io_code:$sql_code" in
    0:0)
      printf 'ok\n'
      ;;
    0:*)
      awk '/Last_SQL_Error:/' "$status" >&2
      printf '%s\n' "sql_error:$sql_code"
      ;;
    *)
      awk '/Last_IO_Error:/' "$status" >&2
      printf '%s\n' "io_error:$io_code"
      ;;
  esac
}

# Writes one log line to stderr.
report () {
  printf '%s\n' "$*" >&2
}

# slave_call PROC
# Runs CALL mysql.PROC('<channel>') for the watched channel; everything mysql
# prints is redirected to stderr.
slave_call () {
  mysql -v -N -e "CALL mysql.$1('$ch')" >&2
}

# Main watchdog loop: poll slave_status and react to each reported state.
# changeMaster and importDump are commands interpolated by the surrounding Nix
# expression; each is run with the channel name and its stdout is piped into
# mysql.
# Exit codes: 0 when the state is ok/unknown or after a successful import,
# 1 after a recovery action or a failed import, 255 on an unrecognised state.
sql_failures=0
none_seen=0
while :; do
  state=$(slave_status "$ch")

  case "$state" in
    ok|unknown)
      # Nothing to repair (or nothing can be said): stop successfully.
      report "status: $state"
      exit
      ;;
    none)
      # XXX existing slave might not be initialized yet after mariadb restarts
      none_seen=$(( none_seen + 1 ))
      report "status: $state (count: $none_seen)"
      if (( none_seen >= 10 )); then
        # Still nothing after repeated polls: reconfigure and re-import.
        slave_call resetSlave
        ${changeMaster} "$ch" | mysql
        if ! ${importDump} "$ch" | mysql; then
          report 'Import failed. Starting over'
          slave_call resetSlave
          exit 1
        fi
        slave_call startSlave
        exit
      fi
      sleep 1m
      continue
      ;;
    io_error:*)
      # Re-apply the master configuration between a stop and a start.
      report "status: $state"
      slave_call stopSlave
      ${changeMaster} "$ch" | mysql
      slave_call startSlave
      exit 1
      ;;
    sql_error:1205) # Lock wait timeout exceeded
      report "status: $state"
      slave_call startSlave
      exit 1
      ;;
    sql_error:*)
      # Escalate with every further SQL error seen during this run.
      sql_failures=$(( sql_failures + 1 ))
      report "status: $state (count: $sql_failures)"
      case "$sql_failures" in
        1)
          slave_call pauseSlave
          sleep 1s
          slave_call startSlave
          ;;
        2)
          slave_call stopSlave
          # this *unlikely* *may* change replication option (ignore tables, etc.)
          ${changeMaster} "$ch" | mysql
          slave_call startSlave
          ;;
        *)
          report '!!! Resetting slave !!!'
          slave_call resetSlave
          exit 1
          ;;
      esac
      # Give the slave time to make progress before polling again.
      sleep 2m
      ;;
    *)
      # slave_status printed something this loop does not know about.
      report "BUG: $state"
      exit 255
      ;;
  esac
  sleep 1s
done
''