test(update-restart): model successful primary-kill in find_gateway_pids mocks
cmd_update gained a post-restart survivor sweep in hermes_cli/main.py (around line 7568, for issue #17648): after the primary kill attempts, sleep 3s, re-poll find_gateway_pids, and SIGKILL any PIDs from killed_pids that are still alive. This catches gateways that ignore SIGTERM (stuck drain, blocked I/O, zombies). Three tests patched find_gateway_pids with a static return_value (or a fake that ignored call count), so the sweep always saw the killed PIDs as still alive and fired an extra SIGKILL — contaminating the kill-call assertions: - test_update_restarts_profile_manual_gateways saw kill called 1× (SIGKILL from sweep) when expecting 0 - test_update_profile_manual_gateway_falls_back_to_sigterm saw kill called 2× (SIGTERM + SIGKILL from sweep) when expecting 1 - test_update_kills_manual_pid_but_not_service_pid saw 2 kills on the manual PID instead of 1 Real behaviour: after SIGTERM lands, the process exits and the next find_gateway_pids() returns []. The static mocks didn't model that. Make each find_gateway_pids mock call-count-aware: first call returns the live PID(s), subsequent calls return empty / service-only — modelling a successful primary kill. The survivor sweep then sees nothing to escalate, so the original assertions hold. Tests that want to exercise the sweep path can opt in by keeping the PID alive across calls; none of the existing tests need that. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5f179d6d35
commit
d70d6b1ab3
@ -415,7 +415,14 @@ class TestCmdUpdateLaunchdRestart:
|
||||
pid=12345,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
|
||||
# Model successful primary-kill path: first find returns the
|
||||
# live PID, subsequent calls (post-restart survivor sweep, #17648)
|
||||
# return [] — the process exited after the graceful drain.
|
||||
find_calls = {"n": 0}
|
||||
def _find_drained(*a, **kw):
|
||||
find_calls["n"] += 1
|
||||
return [12345] if find_calls["n"] == 1 else []
|
||||
with patch.object(gateway_cli, "find_gateway_pids", side_effect=_find_drained), \
|
||||
patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \
|
||||
patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \
|
||||
patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=True) as graceful, \
|
||||
@ -425,7 +432,9 @@ class TestCmdUpdateLaunchdRestart:
|
||||
captured = capsys.readouterr().out
|
||||
restart.assert_called_once_with("coder", 12345)
|
||||
graceful.assert_called_once()
|
||||
# Graceful drain succeeded — no SIGTERM fallback needed.
|
||||
# Graceful drain succeeded — no SIGTERM fallback, and no SIGKILL
|
||||
# from the survivor sweep because the PID drops out of
|
||||
# find_gateway_pids on the second call.
|
||||
kill.assert_not_called()
|
||||
assert "Restarting manual gateway profile(s): coder" in captured
|
||||
assert "Restart manually: hermes gateway run" not in captured
|
||||
@ -453,7 +462,13 @@ class TestCmdUpdateLaunchdRestart:
|
||||
pid=12345,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \
|
||||
# Model successful SIGTERM fallback: first find returns the live
|
||||
# PID; subsequent calls return [] because SIGTERM landed.
|
||||
find_calls = {"n": 0}
|
||||
def _find_drained(*a, **kw):
|
||||
find_calls["n"] += 1
|
||||
return [12345] if find_calls["n"] == 1 else []
|
||||
with patch.object(gateway_cli, "find_gateway_pids", side_effect=_find_drained), \
|
||||
patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \
|
||||
patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \
|
||||
patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=False) as graceful, \
|
||||
@ -463,8 +478,11 @@ class TestCmdUpdateLaunchdRestart:
|
||||
captured = capsys.readouterr().out
|
||||
restart.assert_called_once_with("coder", 12345)
|
||||
graceful.assert_called_once()
|
||||
# Graceful drain returned False → SIGTERM fallback.
|
||||
# Graceful drain returned False → SIGTERM fallback. SIGTERM
|
||||
# works (modelled by the find draining), so the survivor sweep
|
||||
# finds nothing to escalate.
|
||||
kill.assert_called_once()
|
||||
assert kill.call_args.args[0] == 12345
|
||||
assert "Restarting manual gateway profile(s): coder" in captured
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@ -872,9 +890,18 @@ class TestServicePidExclusion:
|
||||
launchctl_loaded=True,
|
||||
)
|
||||
|
||||
# Model successful primary-kill path: first find returns both
|
||||
# PIDs (filtered by the excluded service); subsequent calls
|
||||
# return only the service PID (manual exited after SIGTERM).
|
||||
# Without this, the post-restart survivor sweep (#17648) sends
|
||||
# an extra SIGKILL to the manual PID and breaks the call-count
|
||||
# assertion below.
|
||||
find_calls = {"n": 0}
|
||||
def fake_find(exclude_pids=None, all_profiles=False):
|
||||
_exclude = exclude_pids or set()
|
||||
return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude]
|
||||
find_calls["n"] += 1
|
||||
live = [SERVICE_PID, MANUAL_PID] if find_calls["n"] == 1 else [SERVICE_PID]
|
||||
return [p for p in live if p not in _exclude]
|
||||
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user