diff --git a/bin/runners-restart-safe.sh b/bin/runners-restart-safe.sh
new file mode 100755
index 0000000..2852183
--- /dev/null
+++ b/bin/runners-restart-safe.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# runners-restart-safe.sh — Gracefully restart molecule-runner containers.
+#
+# Before restarting a runner container, waits for any active Gitea Actions
+# task that is currently running on it to complete (so we don't kill a
+# running job mid-execution). A runner that is still busy once the wait
+# budget is spent is skipped, never restarted. After restart, verifies the
+# runner successfully re-registers with the platform by checking for the
+# 'declare successfully' log line.
+#
+# Usage: bin/runners-restart-safe.sh
+# Requires: docker, logger
+# Environment: TEST_MODE=1 disables the task-wait loop (used by the tests).
+
+set -eu
+
+readonly LOG_TAG="runners-restart-safe"
+readonly POLL_SECONDS=30
+
+# Upper bound, in minutes, on how long wait_for_idle polls a busy runner.
+# TEST_MODE=1 forces it to 0 so a busy runner is skipped after one check
+# and tests run fast. In test mode the fake docker must faithfully
+# represent what docker returns.
+if [[ "${TEST_MODE:-}" == "1" ]]; then
+  MAX_WAIT_MINUTES=0
+else
+  MAX_WAIT_MINUTES=60
+fi
+readonly MAX_WAIT_MINUTES
+
+# Poll every POLL_SECONDS for running tasks on a given runner.
+# Arguments: $1 - runner container name.
+# Returns 0 when the runner is idle (safe to restart).
+# Returns 1 when tasks are still running after MAX_WAIT_MINUTES (caller
+# must skip this runner).
+wait_for_idle() {
+  local name="$1"
+  local waited=0
+  # Testing the pipeline in the loop condition keeps grep's "no match"
+  # (rc=1) from tripping set -e. The pipeline status is grep's, so a
+  # failing `docker ps` reads the same as "no tasks".
+  while docker ps --format '{{.Names}}' \
+    | grep -qE "GITEA-ACTIONS-TASK-.+-${name}"; do
+    if (( waited >= MAX_WAIT_MINUTES * 60 )); then
+      logger -t "$LOG_TAG" "$name: waited ${MAX_WAIT_MINUTES}m for tasks — giving up"
+      return 1 # still busy → caller skips the restart
+    fi
+    sleep "$POLL_SECONDS"
+    waited=$(( waited + POLL_SECONDS ))
+  done
+  return 0 # no tasks → safe to restart
+}
+
+# Restart one runner container and confirm it re-registers.
+# Arguments: $1 - runner container name.
+# Returns 0 when the runner was recycled, does not exist, or was skipped
+# because it is busy; 1 when it did not re-register after the restart.
+restart_runner() {
+  local name="$1"
+
+  # Skip if the container doesn't exist.
+  if ! docker inspect "$name" >/dev/null 2>&1; then
+    return 0
+  fi
+
+  # Wait for in-flight tasks to drain before touching the container.
+  if ! wait_for_idle "$name"; then
+    # Still busy after the wait budget — skip rather than kill the job.
+    return 0
+  fi
+
+  # Deliberately tolerate a failed restart: the check below reports it.
+  docker restart -t 30 "$name" || true
+
+  # Give the runner process time to start and emit its re-register log line.
+  sleep 8
+
+  # Verify re-registration succeeded.
+  if docker logs --since 30s "$name" 2>&1 | grep -q 'declare successfully'; then
+    logger -t "$LOG_TAG" "$name: recycled OK"
+    return 0
+  fi
+  logger -t "$LOG_TAG" "$name: failed to re-register (no 'declare successfully' in recent logs)"
+  return 1
+}
+
+# Recycle molecule-runner-1..8; exit 1 if any fails the re-register check.
+main() {
+  local failures=0
+  local i name
+  for i in 1 2 3 4 5 6 7 8; do
+    name="molecule-runner-$i"
+    restart_runner "$name" || failures=$(( failures + 1 ))
+  done
+
+  if (( failures > 0 )); then
+    logger -t "$LOG_TAG" "completed with $failures runner(s) failing re-register check"
+    exit 1
+  fi
+  logger -t "$LOG_TAG" "all runners recycled OK"
+}
+
+main "$@"
diff --git a/scripts/test_runners_restart_safe.py b/scripts/test_runners_restart_safe.py
new file mode 100644
index 0000000..5f16bc6
--- /dev/null
+++ b/scripts/test_runners_restart_safe.py
@@ -0,0 +1,183 @@
+"""Tests for runners-restart-safe.sh.
+
+Uses a PATH-prepended fake-docker + fake-sleep so we can test the script's
+logic (container existence check, task-wait, restart, re-register verify)
+without a real Docker daemon.
+
+In test mode (TEST_MODE=1) the script sets MAX_WAIT_MINUTES=0, so a busy
+runner is skipped after a single check instead of being polled. Every fake
+except BUSY_DOCKER prints "" for `docker ps` (no running tasks), which the
+script reads as an idle runner.
+"""
+from __future__ import annotations
+
+import os
+import shutil
+import stat
+import subprocess
+import tempfile
+import textwrap
+from pathlib import Path
+
+import pytest
+
+SCRIPT = Path(__file__).resolve().parent.parent / "bin" / "runners-restart-safe.sh"
+
+# ---------------------------------------------------------------------------
+# Fake docker helpers
+# ---------------------------------------------------------------------------
+
+def _mktmpdir() -> Path:
+    """Create a temp dir holding a no-op `sleep` executable."""
+    tmpdir = Path(tempfile.mkdtemp(prefix="fake-docker-"))
+    (tmpdir / "sleep").write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
+    os.chmod(tmpdir / "sleep", stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH)
+    return tmpdir
+
+
+def _write_fake_docker(docker_content: str) -> Path:
+    """Write `docker_content` as an executable `docker` beside the fake `sleep`."""
+    tmpdir = _mktmpdir()
+    docker = tmpdir / "docker"
+    docker.write_text(docker_content, encoding="utf-8")
+    os.chmod(docker, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH)
+    return tmpdir
+
+
+def _run_script(fake_docker_dir: Path, env_extra: dict | None = None) -> subprocess.CompletedProcess:
+    """Run the script with the fake dir first on PATH and TEST_MODE=1."""
+    env = {**os.environ, "PATH": f"{fake_docker_dir}:{os.environ['PATH']}", "TEST_MODE": "1"}
+    if env_extra:
+        env.update(env_extra)
+    return subprocess.run(
+        [str(SCRIPT)],
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=30,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Fake docker: all containers missing (inspect exits 1)
+# ---------------------------------------------------------------------------
+IDLE_DOCKER = textwrap.dedent("""\
+    #!/bin/bash
+    # All runners are absent; script should skip all and exit 0.
+    case "$1" in
+        inspect) exit 1 ;;
+        ps) echo "" ;;
+        restart) exit 1 ;;
+        logs) exit 1 ;;
+    esac
+    exit 0
+    """)
+
+
+# ---------------------------------------------------------------------------
+# Fake docker: every runner exists, no tasks, re-register OK
+# ---------------------------------------------------------------------------
+OK_DOCKER = textwrap.dedent("""\
+    #!/bin/bash
+    # Every runner exists and is idle; re-register succeeds.
+    case "$1" in
+        inspect) echo '{"Name": "molecule-runner-1"}'; exit 0 ;;
+        ps) echo "" ;;
+        restart) exit 0 ;;
+        logs) echo "runner started, declare successfully registered"; exit 0 ;;
+    esac
+    exit 1
+    """)
+
+
+# ---------------------------------------------------------------------------
+# Fake docker: every runner exists, no tasks, but re-register FAILS
+# ---------------------------------------------------------------------------
+FAIL_DOCKER = textwrap.dedent("""\
+    #!/bin/bash
+    # Every runner exists and is idle; re-register check fails.
+    # All runners' inspect succeeds so the script attempts all 8 restarts.
+    case "$1" in
+        inspect) echo '{"Name": "'"$2"'"}'; exit 0 ;;
+        ps) echo "" ;;
+        restart) exit 0 ;;
+        logs) echo "some other log output"; exit 0 ;;
+    esac
+    exit 0
+    """)
+
+
+# ---------------------------------------------------------------------------
+# Fake docker: molecule-runner-1 has a running task (skip restart)
+# ---------------------------------------------------------------------------
+BUSY_DOCKER = textwrap.dedent("""\
+    #!/bin/bash
+    # Runner-1 has a running Gitea task; restart must be skipped.
+    # docker ps outputs the busy task for runner-1 (grep matches), but
+    # runners 2-8's grep patterns don't match the task suffix → treated as idle.
+    # restart complains on stderr if it is ever invoked for runner-1, which is
+    # what the test asserts never happens.
+    # logs outputs "declare successfully" so re-register check passes.
+    case "$1" in
+        inspect) echo '{"Name": "'"$2"'"}'; exit 0 ;;
+        ps) echo "GITEA-ACTIONS-TASK-abc123-molecule-runner-1" ;;
+        restart)
+            # Container name is the last argument: docker restart -t 30 NAME.
+            if [ "${@: -1}" = "molecule-runner-1" ]; then
+                echo "restart called while task is running" >&2
+                exit 1
+            fi
+            exit 0 ;;
+        logs) echo "runner started, declare successfully registered"; exit 0 ;;
+    esac
+    exit 0
+    """)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+def test_script_passes_when_all_runners_missing():
+    """Positive path: no runner containers exist → script exits 0."""
+    fake = _write_fake_docker(IDLE_DOCKER)
+    try:
+        result = _run_script(fake)
+        assert result.returncode == 0, f"stderr: {result.stderr}"
+    finally:
+        shutil.rmtree(fake)
+
+
+def test_script_restarts_idle_runner_ok():
+    """Positive path: runner exists, no tasks, re-registers successfully."""
+    fake = _write_fake_docker(OK_DOCKER)
+    try:
+        result = _run_script(fake)
+        assert result.returncode == 0, f"stderr: {result.stderr}"
+    finally:
+        shutil.rmtree(fake)
+
+
+def test_script_fails_on_missing_declare_line():
+    """Negative path: runner restarted but did not emit 'declare successfully'."""
+    fake = _write_fake_docker(FAIL_DOCKER)
+    try:
+        result = _run_script(fake)
+        assert result.returncode == 1, (
+            f"expected failure, got rc={result.returncode}; stderr={result.stderr}"
+        )
+    finally:
+        shutil.rmtree(fake)
+
+
+def test_script_skips_runner_with_busy_task():
+    """Positive path: runner has an in-flight task → script skips it silently."""
+    fake = _write_fake_docker(BUSY_DOCKER)
+    try:
+        result = _run_script(fake)
+        assert result.returncode == 0, (
+            f"expected 0, got rc={result.returncode}; stderr={result.stderr}"
+        )
+        assert "restart called while task is running" not in result.stderr
+    finally:
+        shutil.rmtree(fake)