From 397386cae2e2e4903aa48030c5c5c0a1c4d9126a Mon Sep 17 00:00:00 2001
From: Teknium
Date: Tue, 14 Apr 2026 14:34:34 -0700
Subject: [PATCH] fix: gateway auto-recovers from unexpected SIGTERM via systemd (#5646)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause: when the gateway received SIGTERM (from hermes update, external
kill, WSL2 runtime, etc.), it exited with status 0. systemd's
Restart=on-failure only restarts on non-zero exit, so the gateway stayed
dead permanently. Users had to manually restart.

Fix 1: Signal-initiated shutdown exits non-zero

When SIGTERM/SIGINT is received and no restart was requested (via /restart,
/update, or SIGUSR1), start_gateway() returns False which causes
sys.exit(1). systemd sees a failure exit and auto-restarts after
RestartSec=30.

This is safe because systemctl stop tracks its own stop-requested state
independently of exit code — Restart= never fires for a deliberate stop,
regardless of exit code.

Also logs 'Received SIGTERM/SIGINT — initiating shutdown' so the cause of
unexpected shutdowns is visible in agent.log.

Fix 2: PID file ownership guard

remove_pid_file() now checks that the PID file belongs to the current
process before removing it. During --replace handoffs, the old process's
atexit handler could fire AFTER the new process wrote its PID file,
deleting the new record. This left the gateway running but invisible to
get_running_pid(), causing 'Another gateway already running' errors on
next restart.

Test plan: - All restart drain tests pass (13) - All gateway service tests pass (84) - All update gateway restart tests pass (34) --- gateway/run.py | 25 +++++++++++++++++++++++++ gateway/status.py | 20 ++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 0cdfb714..da3560cf 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9261,8 +9261,18 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = runner = GatewayRunner(config) + # Track whether a signal initiated the shutdown (vs. internal request). + # When an unexpected SIGTERM kills the gateway, we exit non-zero so + # systemd's Restart=on-failure revives the process. systemctl stop + # is safe: systemd tracks stop-requested state independently of exit + # code, so Restart= never fires for a deliberate stop. + _signal_initiated_shutdown = False + # Set up signal handlers def shutdown_signal_handler(): + nonlocal _signal_initiated_shutdown + _signal_initiated_shutdown = True + logger.info("Received SIGTERM/SIGINT — initiating shutdown") asyncio.create_task(runner.stop()) def restart_signal_handler(): @@ -9332,6 +9342,21 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = if runner.exit_code is not None: raise SystemExit(runner.exit_code) + # When a signal (SIGTERM/SIGINT) caused the shutdown and it wasn't a + # planned restart (/restart, /update, SIGUSR1), exit non-zero so + # systemd's Restart=on-failure revives the process. This covers: + # - hermes update killing the gateway mid-work + # - External kill commands + # - WSL2/container runtime sending unexpected signals + # systemctl stop is safe: systemd tracks "stop requested" state + # independently of exit code, so Restart= never fires for it. 
+ if _signal_initiated_shutdown and not runner._restart_requested: + logger.info( + "Exiting with code 1 (signal-initiated shutdown without restart " + "request) so systemd Restart=on-failure can revive the gateway." + ) + return False # → sys.exit(1) in the caller + return True diff --git a/gateway/status.py b/gateway/status.py index a801cfe5..becf9e8c 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -266,9 +266,25 @@ def read_runtime_status() -> Optional[dict[str, Any]]: def remove_pid_file() -> None: - """Remove the gateway PID file if it exists.""" + """Remove the gateway PID file, but only if it belongs to this process. + + During --replace handoffs, the old process's atexit handler can fire AFTER + the new process has written its own PID file. Blindly removing the file + would delete the new process's record, leaving the gateway running with no + PID file (invisible to ``get_running_pid()``). + """ try: - _get_pid_path().unlink(missing_ok=True) + path = _get_pid_path() + record = _read_json_file(path) + if record is not None: + try: + file_pid = int(record["pid"]) + except (KeyError, TypeError, ValueError): + file_pid = None + if file_pid is not None and file_pid != os.getpid(): + # PID file belongs to a different process — leave it alone. + return + path.unlink(missing_ok=True) except Exception: pass