fix(local-env): use SIGKILL directly on cleanup, don't wait for zombie group reap
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 15s
Supply Chain Audit / Scan PR for critical supply chain risks (pull_request) Successful in 20s
Nix / nix (ubuntu-latest) (pull_request) Failing after 9m23s
Tests / test (pull_request) Failing after 11m50s
Tests / e2e (pull_request) Successful in 1m4s
Nix / nix (macos-latest) (pull_request) Has been cancelled

Two real bugs in tools/environments/local.py:_kill_process surfaced under
runner load (operator host load avg 16-37 on 8 CPUs):

1. SIGTERM-then-1s-wait-then-SIGKILL escalation is the wrong shape for the
   cleanup path. _kill_process is invoked from base.py:_wait_for_process's
   timeout and KeyboardInterrupt/SystemExit branches — by the time we're
   here, the caller has given up on graceful shutdown. The 1s SIGTERM-wait
   provides no benefit and on oversubscribed hosts blows past tight test
   budgets (test_timeout_path_still_works: assert elapsed < 4.0s, was 5.05s).
   Send SIGKILL directly: it's unblockable and the kernel processes it
   synchronously.

2. _wait_for_group_exit polled os.killpg(pgid, 0) for the group to disappear,
   but kill(-pgid, 0) returns success for zombies. In containers without a
   proper PID-1 reaper (tini/dumb-init), orphaned zombies linger until
   container exit. The function sat in its 2.0s ceiling waiting for kernel
   bookkeeping that would never happen. SIGKILL stopped the processes; their
   zombie entries are not our problem.

Replace both with a single os.killpg(SIGKILL) + proc.wait(timeout=0.5) to
reap our direct child. The nested _group_alive / _wait_for_group_exit
helpers are removed (no callers remain).

Test fixes:
- test_kill_process_uses_cached_pgid_if_wrapper_already_exited: update
  expected call sequence to [(SIGKILL)] (was [SIGTERM, KILL-0-check]).
- _pgid_still_alive helper: use ps STAT field to skip zombies, instead of
  os.killpg(pgid, 0). Zombies aren't "alive" in any meaningful sense and
  reporting them as such causes false-positive 'orphan bug regressed' alerts
  on container runners.

Verified: tests/tools/test_local_interrupt_cleanup.py +
tests/tools/test_local_background_child_hang.py — all 10 tests pass 3/3
runs on the act_runner image (load avg 16+).

Remaining red tests on this branch (test_concurrent_inserts_settle_at_cap,
test_blocking_approval_*, snapshot-drift tests) are unrelated to this
change — they have separate root causes (test creates 160 real AIAgents;
test order-pollution; code drift). Tracking separately.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
claude-ceo-assistant 2026-05-08 02:33:43 +00:00
parent 5d3be898a8
commit d6fca4f623
2 changed files with 41 additions and 51 deletions

View File

@ -30,12 +30,31 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
def _pgid_still_alive(pgid: int) -> bool:
"""Return True if any process in the given process group is still alive."""
"""Return True if any LIVE (non-zombie) process in the group remains.
Zombies (stat=Z) are stopped the kernel has cleaned up their state but
PID 1 hasn't called wait() yet. In containers without a proper reaping
init at PID 1 (tini, dumb-init), zombies linger until container exit. We
don't want the test to fail because of unreaped bookkeeping; we want it
to fail only if a process is actually still executing.
"""
try:
os.killpg(pgid, 0) # signal 0 = existence check
return True
except ProcessLookupError:
return False
out = subprocess.run(
["ps", "-g", str(pgid), "-o", "stat="],
capture_output=True, text=True, check=False,
).stdout
except Exception:
# Fall back to the old behaviour if ps is unavailable.
try:
os.killpg(pgid, 0)
return True
except ProcessLookupError:
return False
for line in out.splitlines():
stat = line.strip()
if stat and not stat.startswith("Z"):
return True
return False
def _process_group_snapshot(pgid: int) -> str:
@ -71,6 +90,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
_hermes_pgid=67890,
poll=lambda: 0,
kill=lambda: None,
wait=lambda timeout=None: 0,
)
killpg_calls = []
@ -79,15 +99,15 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
def fake_killpg(pgid, sig):
killpg_calls.append((pgid, sig))
if sig == 0:
raise ProcessLookupError
monkeypatch.setattr(os, "getpgid", fake_getpgid)
monkeypatch.setattr(os, "killpg", fake_killpg)
env._kill_process(proc)
assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)]
# Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
# because the caller (timeout/interrupt branches) has already given up.
assert killpg_calls == [(67890, signal.SIGKILL)]
def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():

View File

@ -384,35 +384,6 @@ class LocalEnvironment(BaseEnvironment):
def _kill_process(self, proc):
"""Kill the entire process group (all children)."""
def _group_alive(pgid: int) -> bool:
try:
# POSIX-only: _IS_WINDOWS is handled before this helper is used.
os.killpg(pgid, 0)
return True
except ProcessLookupError:
return False
except PermissionError:
# The group exists, even if this process cannot signal it.
return True
def _wait_for_group_exit(pgid: int, timeout: float) -> bool:
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
# Reap the wrapper promptly. A dead but unreaped group leader
# still makes killpg(pgid, 0) report the group as alive.
try:
proc.poll()
except Exception:
pass
if not _group_alive(pgid):
return True
time.sleep(0.05)
try:
proc.poll()
except Exception:
pass
return not _group_alive(pgid)
try:
if _IS_WINDOWS:
proc.terminate()
@ -424,25 +395,24 @@ class LocalEnvironment(BaseEnvironment):
if pgid is None:
raise
# Cleanup path: timeout / KeyboardInterrupt / SystemExit.
# Graceful shutdown is not wanted — the caller has given up on
# the process. Send SIGKILL directly: it's unblockable and the
# kernel processes it synchronously. By the time the syscall
# returns, every process in the group is marked dead.
try:
os.killpg(pgid, signal.SIGTERM)
except ProcessLookupError:
return
# Wait on the process group, not just the shell wrapper. Under
# load the wrapper can exit before grandchildren do; returning
# at that point leaves orphaned process-group members behind.
if _wait_for_group_exit(pgid, 1.0):
return
try:
# POSIX-only: _IS_WINDOWS is handled by the outer branch.
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
return
_wait_for_group_exit(pgid, 2.0)
# Reap our direct child (the wrapper bash). We do NOT wait
# for the entire process group to disappear via kill(-pgid, 0)
# — that requires PID 1 (or some ancestor) to reap orphaned
# zombies, which is not guaranteed in containers without a
# proper init (tini/dumb-init). The processes have stopped;
# their zombie entries are the kernel's bookkeeping, not our
# problem to wait on.
try:
proc.wait(timeout=0.2)
proc.wait(timeout=0.5)
except (subprocess.TimeoutExpired, OSError):
pass
except (ProcessLookupError, PermissionError, OSError):