test(update-restart): model successful primary-kill in find_gateway_pids mocks

cmd_update gained a post-restart survivor sweep in hermes_cli/main.py
(around line 7568, for issue #17648): after the primary kill attempts,
sleep 3s, re-poll find_gateway_pids, and SIGKILL any PIDs from
killed_pids that are still alive. This catches gateways that ignore
SIGTERM (stuck drain, blocked I/O, zombies).

Three tests patched find_gateway_pids with a static return_value
(or a fake that ignored call count), so the sweep always saw the
killed PIDs as still alive and fired an extra SIGKILL — contaminating
the kill-call assertions:

- test_update_restarts_profile_manual_gateways saw kill called 1×
  (SIGKILL from sweep) when expecting 0
- test_update_profile_manual_gateway_falls_back_to_sigterm saw kill
  called 2× (SIGTERM + SIGKILL from sweep) when expecting 1
- test_update_kills_manual_pid_but_not_service_pid saw 2 kills on the
  manual PID instead of 1

Real behaviour: after SIGTERM lands, the process exits and the next
find_gateway_pids() returns []. The static mocks didn't model that.
Make each find_gateway_pids mock call-count-aware: first call returns
the live PID(s), subsequent calls return empty / service-only —
modelling a successful primary kill. The survivor sweep then sees
nothing to escalate, so the original assertions hold.

Tests that want to exercise the sweep path can opt in by keeping the
PID alive across calls; none of the existing tests need that.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
claude-ceo-assistant 2026-05-08 03:43:43 +00:00
parent 5f179d6d35
commit d70d6b1ab3

View File

@ -415,7 +415,14 @@ class TestCmdUpdateLaunchdRestart:
pid=12345,
)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
# Model successful primary-kill path: first find returns the
# live PID, subsequent calls (post-restart survivor sweep, #17648)
# return [] — the process exited after the graceful drain.
find_calls = {"n": 0}
def _find_drained(*a, **kw):
find_calls["n"] += 1
return [12345] if find_calls["n"] == 1 else []
with patch.object(gateway_cli, "find_gateway_pids", side_effect=_find_drained), \
patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \
patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \
patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=True) as graceful, \
@ -425,7 +432,9 @@ class TestCmdUpdateLaunchdRestart:
captured = capsys.readouterr().out
restart.assert_called_once_with("coder", 12345)
graceful.assert_called_once()
# Graceful drain succeeded — no SIGTERM fallback needed.
# Graceful drain succeeded — no SIGTERM fallback, and no SIGKILL
# from the survivor sweep because the PID drops out of
# find_gateway_pids on the second call.
kill.assert_not_called()
assert "Restarting manual gateway profile(s): coder" in captured
assert "Restart manually: hermes gateway run" not in captured
@ -453,7 +462,13 @@ class TestCmdUpdateLaunchdRestart:
pid=12345,
)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
# Model successful SIGTERM fallback: first find returns the live
# PID; subsequent calls return [] because SIGTERM landed.
find_calls = {"n": 0}
def _find_drained(*a, **kw):
find_calls["n"] += 1
return [12345] if find_calls["n"] == 1 else []
with patch.object(gateway_cli, "find_gateway_pids", side_effect=_find_drained), \
patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \
patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \
patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=False) as graceful, \
@ -463,8 +478,11 @@ class TestCmdUpdateLaunchdRestart:
captured = capsys.readouterr().out
restart.assert_called_once_with("coder", 12345)
graceful.assert_called_once()
# Graceful drain returned False → SIGTERM fallback.
# Graceful drain returned False → SIGTERM fallback. SIGTERM
# works (modelled by the find draining), so the survivor sweep
# finds nothing to escalate.
kill.assert_called_once()
assert kill.call_args.args[0] == 12345
assert "Restarting manual gateway profile(s): coder" in captured
@patch("shutil.which", return_value=None)
@ -872,9 +890,18 @@ class TestServicePidExclusion:
launchctl_loaded=True,
)
# Model successful primary-kill path: first find returns both
# PIDs (filtered by the excluded service); subsequent calls
# return only the service PID (manual exited after SIGTERM).
# Without this, the post-restart survivor sweep (#17648) sends
# an extra SIGKILL to the manual PID and breaks the call-count
# assertion below.
find_calls = {"n": 0}
def fake_find(exclude_pids=None, all_profiles=False):
_exclude = exclude_pids or set()
return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude]
find_calls["n"] += 1
live = [SERVICE_PID, MANUAL_PID] if find_calls["n"] == 1 else [SERVICE_PID]
return [p for p in live if p not in _exclude]
with patch.object(
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}