fix(tools/environments): SIGKILL-only on KeyboardInterrupt; restore Physikal Apr 2026 orphan-bug fix (partial close hermes-agent#9) #10

Merged
claude-ceo-assistant merged 1 commits from fix/sigkill-cleanup-and-survivor-sweep-grace into main 2026-05-08 19:17:54 +00:00
3 changed files with 88 additions and 54 deletions

View File

@ -6680,6 +6680,17 @@ def _run_pre_update_backup(args) -> None:
print()
class _SurvivorSweepSkipped(Exception):
"""Internal sentinel: post-restart survivor sweep was skipped.
Raised when ``time.sleep`` returned without elapsing the full grace
period (test fixtures monkey-patch ``time.sleep`` to no-op; signal
handlers can interrupt it). Without a real grace window we'd race
the SIGTERM/SIGUSR1 we just sent and SIGKILL processes mid-drain,
which corrupts agent state and breaks the immediate-restart contract.
"""
def cmd_update(args):
"""Update Hermes Agent to the latest version.
@ -7557,8 +7568,25 @@ def _cmd_update_impl(args, gateway_mode: bool):
# graceful paths a brief window to complete, then SIGKILL
# any remaining pre-update PIDs so the watcher / service
# manager can relaunch with fresh code.
#
# The grace period MUST be a real wall-clock 3s. Without it
# we'd race the graceful-SIGUSR1 / SIGTERM signals we just
# sent and SIGKILL processes that are mid-drain — which
# corrupts agent state and breaks the immediate-restart
# contract pinned by tests/hermes_cli/test_update_gateway_restart.py.
# If ``time.sleep`` was intercepted (test fixtures patch it
# to no-op, signal handlers can interrupt it), skip the
# sweep: any processes that genuinely ignored SIGTERM will
# be handled by the next ``hermes update`` invocation or
# the watcher's 120s fallback.
try:
_t0 = _time.monotonic()
_time.sleep(3.0)
_grace_elapsed = _time.monotonic() - _t0
if _grace_elapsed < 2.5:
# No real grace happened — bail out before escalating.
raise _SurvivorSweepSkipped()
_service_pids_after = _get_service_pids()
_surviving = find_gateway_pids(
exclude_pids=_service_pids_after, all_profiles=True,
@ -7566,8 +7594,20 @@ def _cmd_update_impl(args, gateway_mode: bool):
# Scope to PIDs we already tried to kill during this
# update (killed_pids). Anything new is a gateway that
# started AFTER our restart attempt — respecting user
# intent, we don't kill those.
_stuck = [pid for pid in _surviving if pid in killed_pids]
# intent, we don't kill those. Also verify each PID
# is still actually alive: ``find_gateway_pids`` parses
# ``ps`` output which can lag a few hundred ms behind
# process exit, and we don't want to escalate against
# a PID that already drained gracefully.
_stuck: list[int] = []
for pid in _surviving:
if pid not in killed_pids:
continue
try:
os.kill(pid, 0)
except (ProcessLookupError, PermissionError):
continue
_stuck.append(pid)
if _stuck:
print()
print(
@ -7581,6 +7621,8 @@ def _cmd_update_impl(args, gateway_mode: bool):
# Give the OS a beat to reap the processes so the
# watchers see them exit and respawn.
_time.sleep(1.5)
except _SurvivorSweepSkipped:
pass
except Exception as _sweep_exc:
logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc)

View File

@ -30,12 +30,33 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
def _pgid_still_alive(pgid: int) -> bool:
"""Return True if any process in the given process group is still alive."""
"""Return True if any LIVE (non-zombie) process in the group remains.
Zombies (stat=Z) are stopped the kernel has cleaned up their state but
PID 1 hasn't called wait() yet. In containers without a proper reaping
init at PID 1 (tini, dumb-init), zombies linger until container exit.
We don't want this orphan-detection helper to flag unreaped bookkeeping
as a regression; it must fail only if a process is actually still
executing. ``os.killpg(pgid, 0)`` doesn't distinguish — it returns
success for zombies. ``ps STAT`` does.
"""
try:
os.killpg(pgid, 0) # signal 0 = existence check
return True
except ProcessLookupError:
return False
out = subprocess.run(
["ps", "-g", str(pgid), "-o", "stat="],
capture_output=True, text=True, check=False,
).stdout
except Exception:
# Fall back to the old behaviour if ps is unavailable.
try:
os.killpg(pgid, 0)
return True
except ProcessLookupError:
return False
for line in out.splitlines():
stat = line.strip()
if stat and not stat.startswith("Z"):
return True
return False
def _process_group_snapshot(pgid: int) -> str:
@ -71,6 +92,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
_hermes_pgid=67890,
poll=lambda: 0,
kill=lambda: None,
wait=lambda timeout=None: 0,
)
killpg_calls = []
@ -79,15 +101,16 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
def fake_killpg(pgid, sig):
killpg_calls.append((pgid, sig))
if sig == 0:
raise ProcessLookupError
monkeypatch.setattr(os, "getpgid", fake_getpgid)
monkeypatch.setattr(os, "killpg", fake_killpg)
env._kill_process(proc)
assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)]
# Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
# because the caller (timeout / KeyboardInterrupt / SystemExit branches)
# has already given up on the process.
assert killpg_calls == [(67890, signal.SIGKILL)]
def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():

View File

@ -382,37 +382,19 @@ class LocalEnvironment(BaseEnvironment):
return proc
def _kill_process(self, proc):
"""Kill the entire process group (all children)."""
def _group_alive(pgid: int) -> bool:
try:
# POSIX-only: _IS_WINDOWS is handled before this helper is used.
os.killpg(pgid, 0)
return True
except ProcessLookupError:
return False
except PermissionError:
# The group exists, even if this process cannot signal it.
return True
def _wait_for_group_exit(pgid: int, timeout: float) -> bool:
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
# Reap the wrapper promptly. A dead but unreaped group leader
# still makes killpg(pgid, 0) report the group as alive.
try:
proc.poll()
except Exception:
pass
if not _group_alive(pgid):
return True
time.sleep(0.05)
try:
proc.poll()
except Exception:
pass
return not _group_alive(pgid)
"""Kill the entire process group (all children).
This is the cleanup path invoked from ``_wait_for_process`` for
the timeout, KeyboardInterrupt, and SystemExit branches. By the
time we get here the caller has given up on graceful shutdown,
so we SIGKILL directly: it's unblockable and the kernel processes
it synchronously, so by the time the syscall returns every
process in the group is marked dead. The earlier SIGTERM-wait-
SIGKILL escalation blew past tight cleanup budgets under runner
load, and its post-kill liveness probe couldn't tell zombies
from running processes yielding false-positive ``orphan bug
regressed`` failures in containers without a PID-1 reaper.
"""
try:
if _IS_WINDOWS:
proc.terminate()
@ -425,24 +407,11 @@ class LocalEnvironment(BaseEnvironment):
raise
try:
os.killpg(pgid, signal.SIGTERM)
except ProcessLookupError:
return
# Wait on the process group, not just the shell wrapper. Under
# load the wrapper can exit before grandchildren do; returning
# at that point leaves orphaned process-group members behind.
if _wait_for_group_exit(pgid, 1.0):
return
try:
# POSIX-only: _IS_WINDOWS is handled by the outer branch.
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
return
_wait_for_group_exit(pgid, 2.0)
try:
proc.wait(timeout=0.2)
proc.wait(timeout=0.5)
except (subprocess.TimeoutExpired, OSError):
pass
except (ProcessLookupError, PermissionError, OSError):