sre(runners): add graceful runner restart script with unit tests #8

Merged
infra-lead merged 1 commits from sre/add-runners-restart-safe-script into main 2026-05-11 04:15:16 +00:00
2 changed files with 270 additions and 0 deletions
+96
View File
@@ -0,0 +1,96 @@
#!/bin/bash
# runners-restart-safe.sh — Gracefully restart molecule-runner containers.
#
# Before restarting a runner container, waits for any active Gitea Actions
# task that is currently running on it to complete (so we don't kill a
# running job mid-execution). After restart, verifies the runner
# successfully re-registers with the platform by checking for the
# 'declare successfully' log line.
#
# Usage: bin/runners-restart-safe.sh
# Requires: docker, logger
set -eu
LOG_TAG="runners-restart-safe"
# TEST_MODE=1 skips the task-wait loop (MAX_WAIT_MINUTES=0) so tests run fast.
# In test mode the fake docker must faithfully represent what docker returns.
MAX_WAIT_MINUTES=${TEST_MODE:-60} # default 60 min; 0 = skip wait loop
# Poll every 30 s for running tasks on a given runner.
# Returns 0 when the runner is idle (safe to restart).
# Returns 1 when tasks are still running (caller must skip this runner).
wait_for_idle() {
local name="$1"
local waited=0
while true; do
# Temporarily disable set -e so grep's "no match" (rc=1) does not
# cause an early script exit. The assignment captures the exit code.
set +e
docker ps --format '{{.Names}}' | grep -qE "GITEA-ACTIONS-TASK-.+-${name}"
local rc=$?
set -e
if (( rc == 0 )); then
# Task still running — wait or give up.
if (( waited >= MAX_WAIT_MINUTES * 60 )); then
logger -t "$LOG_TAG" "$name: waited ${MAX_WAIT_MINUTES}m for tasks — giving up"
return 0 # timed out → treat as idle, let restart proceed
fi
sleep 30
waited=$(( waited + 30 ))
else
break # no tasks, runner is idle
fi
done
return 1 # idle → safe to restart
}
restart_runner() {
local name="$1"
# Skip if the container doesn't exist.
if ! docker inspect "$name" >/dev/null 2>&1; then
return 0
fi
# Wait for in-flight tasks to drain before touching the container.
# wait_for_idle returns 1 when idle (safe to restart), 0 when busy/timed-out.
wait_for_idle "$name"; local wi_rc=$?
if (( wi_rc != 1 )); then
# Runner is busy or timed out — skip silently.
return 0
fi
docker restart -t 30 "$name" || true
# Give the runner process time to start and emit its re-register log line.
sleep 8
# Verify re-registration succeeded.
set +e
docker logs --since 30s "$name" 2>&1 | grep -q 'declare successfully'
local rc=$?
set -e
if (( rc == 0 )); then
logger -t "$LOG_TAG" "$name: recycled OK"
else
logger -t "$LOG_TAG" "$name: failed to re-register (no 'declare successfully' in recent logs)"
return 1
fi
}
main() {
local failures=0
for i in 1 2 3 4 5 6 7 8; do
name="molecule-runner-$i"
restart_runner "$name" || failures=$(( failures + 1 ))
done
if (( failures > 0 )); then
logger -t "$LOG_TAG" "completed with $failures runner(s) failing re-register check"
exit 1
fi
logger -t "$LOG_TAG" "all runners recycled OK"
}
main "$@"
+174
View File
@@ -0,0 +1,174 @@
"""Tests for runners-restart-safe.sh.
Uses a PATH-prepended fake-docker + fake-sleep so we can test the script's
logic (container existence check, task-wait, restart, re-register verify)
without a real Docker daemon.
In test mode (TEST_MODE=1) the script sets MAX_WAIT_MINUTES=0 so the
wait loop exits on the first iteration. Fake docker ps outputs ""
(no running tasks) so the runner is always idle.
"""
from __future__ import annotations
import os
import stat
import subprocess
import tempfile
import textwrap
from pathlib import Path
import pytest
SCRIPT = Path(__file__).resolve().parent.parent / "bin" / "runners-restart-safe.sh"
# ---------------------------------------------------------------------------
# Fake docker helpers
# ---------------------------------------------------------------------------
def _mktmpdir() -> Path:
tmpdir = Path(tempfile.mkdtemp(prefix="fake-docker-"))
(tmpdir / "sleep").write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
os.chmod(tmpdir / "sleep", stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH)
return tmpdir
def _write_fake_docker(docker_content: str) -> Path:
tmpdir = _mktmpdir()
docker = tmpdir / "docker"
docker.write_text(docker_content, encoding="utf-8")
os.chmod(docker, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH)
return tmpdir
def _run_script(fake_docker_dir: Path, env_extra: dict | None = None) -> subprocess.CompletedProcess:
env = {**os.environ, "PATH": f"{fake_docker_dir}:{os.environ['PATH']}", "TEST_MODE": "1"}
if env_extra:
env.update(env_extra)
return subprocess.run(
[str(SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=30,
)
# ---------------------------------------------------------------------------
# Fake docker: all containers missing (inspect exits 1)
# ---------------------------------------------------------------------------
IDLE_DOCKER = textwrap.dedent("""\
#!/bin/bash
# All runners are absent; script should skip all and exit 0.
case "$1" in
inspect) exit 1 ;;
ps) echo "" ;;
restart) exit 1 ;;
logs) exit 1 ;;
esac
exit 0
""")
# ---------------------------------------------------------------------------
# Fake docker: molecule-runner-1 exists, no tasks, re-register OK
# ---------------------------------------------------------------------------
OK_DOCKER = textwrap.dedent("""\
#!/bin/bash
# Runner-1 exists and is idle; re-register succeeds.
case "$1" in
inspect) echo '{"Name": "molecule-runner-1"}'; exit 0 ;;
ps) echo "" ;;
restart) exit 0 ;;
logs) echo "runner started, declare successfully registered"; exit 0 ;;
esac
exit 1
""")
# ---------------------------------------------------------------------------
# Fake docker: molecule-runner-1 exists, no tasks, but re-register FAILS
# ---------------------------------------------------------------------------
FAIL_DOCKER = textwrap.dedent("""\
#!/bin/bash
# Runner-1 exists and is idle; re-register check fails.
# All runners' inspect succeeds so the script attempts all 8 restarts.
case "$1" in
inspect) echo '{"Name": "'"$2"'"}'; exit 0 ;;
ps) echo "" ;;
restart) exit 0 ;;
logs) echo "some other log output"; exit 0 ;;
esac
exit 0
""")
# ---------------------------------------------------------------------------
# Fake docker: molecule-runner-1 has a running task (skip restart)
# ---------------------------------------------------------------------------
BUSY_DOCKER = textwrap.dedent("""\
#!/bin/bash
# Runner-1 has a running Gitea task; restart must be skipped.
# docker ps outputs the busy task for runner-1 (grep matches), but
# runners 2-8's grep patterns don't match the task suffix → treated as idle.
# logs outputs "declare successfully" so re-register check passes.
case "$1" in
inspect) echo '{"Name": "'"$2"'"}'; exit 0 ;;
ps) echo "GITEA-ACTIONS-TASK-abc123-molecule-runner-1" ;;
restart) exit 0 ;;
logs) echo "runner started, declare successfully registered"; exit 0 ;;
esac
exit 0
""")
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_script_passes_when_all_runners_missing():
"""Positive path: no runner containers exist → script exits 0."""
fake = _write_fake_docker(IDLE_DOCKER)
try:
result = _run_script(fake)
assert result.returncode == 0, f"stderr: {result.stderr}"
finally:
import shutil
shutil.rmtree(fake)
def test_script_restarts_idle_runner_ok():
"""Positive path: runner exists, no tasks, re-registers successfully."""
fake = _write_fake_docker(OK_DOCKER)
try:
result = _run_script(fake)
assert result.returncode == 0, f"stderr: {result.stderr}"
finally:
import shutil
shutil.rmtree(fake)
def test_script_fails_on_missing_declare_line():
"""Negative path: runner restarted but did not emit 'declare successfully'."""
fake = _write_fake_docker(FAIL_DOCKER)
try:
result = _run_script(fake)
assert result.returncode == 1, (
f"expected failure, got rc={result.returncode}; stderr={result.stderr}"
)
finally:
import shutil
shutil.rmtree(fake)
def test_script_skips_runner_with_busy_task():
"""Positive path: runner has an in-flight task → script skips it silently."""
fake = _write_fake_docker(BUSY_DOCKER)
try:
result = _run_script(fake)
assert result.returncode == 0, (
f"expected 0, got rc={result.returncode}; stderr={result.stderr}"
)
assert "restart called while task is running" not in result.stderr
finally:
import shutil
shutil.rmtree(fake)