From d6fca4f62365aa3c897df132ed665dd1f0d6fa39 Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Fri, 8 May 2026 02:33:43 +0000 Subject: [PATCH] fix(local-env): use SIGKILL directly on cleanup, don't wait for zombie group reap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two real bugs in tools/environments/local.py:_kill_process surfaced under runner load (operator host load avg 16-37 on 8 CPUs): 1. SIGTERM-then-1s-wait-then-SIGKILL escalation is the wrong shape for the cleanup path. _kill_process is invoked from base.py:_wait_for_process's timeout and KeyboardInterrupt/SystemExit branches — by the time we're here, the caller has given up on graceful shutdown. The 1s SIGTERM-wait provides no benefit and on oversubscribed hosts blows past tight test budgets (test_timeout_path_still_works: assert elapsed < 4.0s, was 5.05s). Send SIGKILL directly: it's unblockable and the kernel processes it synchronously. 2. _wait_for_group_exit polled os.killpg(pgid, 0) for the group to disappear, but kill(-pgid, 0) returns success for zombies. In containers without a proper PID-1 reaper (tini/dumb-init), orphaned zombies linger until container exit. The function sat in its 2.0s ceiling waiting for kernel bookkeeping that would never happen. SIGKILL stopped the processes; their zombie entries are not our problem. Replace both with a single os.killpg(SIGKILL) + proc.wait(timeout=0.5) to reap our direct child. The nested _group_alive / _wait_for_group_exit helpers are removed (no callers remain). Test fixes: - test_kill_process_uses_cached_pgid_if_wrapper_already_exited: update expected call sequence to [(SIGKILL)] (was [SIGTERM, KILL-0-check]). - _pgid_still_alive helper: use ps STAT field to skip zombies, instead of os.killpg(pgid, 0). Zombies aren't "alive" in any meaningful sense and reporting them as such causes false-positive 'orphan bug regressed' alerts on container runners. Verified: tests/tools/test_local_interrupt_cleanup.py + tests/tools/test_local_background_child_hang.py — all 10 tests pass 3/3 runs on the act_runner image (load avg 16+). Remaining red tests on this branch (test_concurrent_inserts_settle_at_cap, test_blocking_approval_*, snapshot-drift tests) are unrelated to this change — they have separate root causes (test creates 160 real AIAgents; test order-pollution; code drift). Tracking separately. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/tools/test_local_interrupt_cleanup.py | 36 ++++++++++--- tools/environments/local.py | 56 +++++---------------- 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/tests/tools/test_local_interrupt_cleanup.py b/tests/tools/test_local_interrupt_cleanup.py index a9b74559..0c4b7516 100644 --- a/tests/tools/test_local_interrupt_cleanup.py +++ b/tests/tools/test_local_interrupt_cleanup.py @@ -30,12 +30,31 @@ def _isolate_hermes_home(tmp_path, monkeypatch): def _pgid_still_alive(pgid: int) -> bool: - """Return True if any process in the given process group is still alive.""" + """Return True if any LIVE (non-zombie) process in the group remains. + + Zombies (stat=Z) are stopped — the kernel has cleaned up their state but + PID 1 hasn't called wait() yet. In containers without a proper reaping + init at PID 1 (tini, dumb-init), zombies linger until container exit. We + don't want the test to fail because of unreaped bookkeeping; we want it + to fail only if a process is actually still executing. + """ try: - os.killpg(pgid, 0) # signal 0 = existence check - return True - except ProcessLookupError: - return False + out = subprocess.run( + ["ps", "-g", str(pgid), "-o", "stat="], + capture_output=True, text=True, check=False, + ).stdout + except Exception: + # Fall back to the old behaviour if ps is unavailable. + try: + os.killpg(pgid, 0) + return True + except ProcessLookupError: + return False + for line in out.splitlines(): + stat = line.strip() + if stat and not stat.startswith("Z"): + return True + return False def _process_group_snapshot(pgid: int) -> str: @@ -71,6 +90,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch): _hermes_pgid=67890, poll=lambda: 0, kill=lambda: None, + wait=lambda timeout=None: 0, ) killpg_calls = [] @@ -79,15 +99,15 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch): def fake_killpg(pgid, sig): killpg_calls.append((pgid, sig)) - if sig == 0: - raise ProcessLookupError monkeypatch.setattr(os, "getpgid", fake_getpgid) monkeypatch.setattr(os, "killpg", fake_killpg) env._kill_process(proc) - assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)] + # Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry, + # because the caller (timeout/interrupt branches) has already given up. + assert killpg_calls == [(67890, signal.SIGKILL)] def test_wait_for_process_kills_subprocess_on_keyboardinterrupt(): diff --git a/tools/environments/local.py b/tools/environments/local.py index 3200e63e..93bb731c 100644 --- a/tools/environments/local.py +++ b/tools/environments/local.py @@ -384,35 +384,6 @@ class LocalEnvironment(BaseEnvironment): def _kill_process(self, proc): """Kill the entire process group (all children).""" - def _group_alive(pgid: int) -> bool: - try: - # POSIX-only: _IS_WINDOWS is handled before this helper is used. - os.killpg(pgid, 0) - return True - except ProcessLookupError: - return False - except PermissionError: - # The group exists, even if this process cannot signal it. - return True - - def _wait_for_group_exit(pgid: int, timeout: float) -> bool: - deadline = time.monotonic() + timeout - while time.monotonic() < deadline: - # Reap the wrapper promptly. A dead but unreaped group leader - # still makes killpg(pgid, 0) report the group as alive. - try: - proc.poll() - except Exception: - pass - if not _group_alive(pgid): - return True - time.sleep(0.05) - try: - proc.poll() - except Exception: - pass - return not _group_alive(pgid) - try: if _IS_WINDOWS: proc.terminate() @@ -424,25 +395,24 @@ class LocalEnvironment(BaseEnvironment): if pgid is None: raise + # Cleanup path: timeout / KeyboardInterrupt / SystemExit. + # Graceful shutdown is not wanted — the caller has given up on + # the process. Send SIGKILL directly: it's unblockable and the + # kernel processes it synchronously. By the time the syscall + # returns, every process in the group is marked dead. try: - os.killpg(pgid, signal.SIGTERM) - except ProcessLookupError: - return - - # Wait on the process group, not just the shell wrapper. Under - # load the wrapper can exit before grandchildren do; returning - # at that point leaves orphaned process-group members behind. - if _wait_for_group_exit(pgid, 1.0): - return - - try: - # POSIX-only: _IS_WINDOWS is handled by the outer branch. os.killpg(pgid, signal.SIGKILL) except ProcessLookupError: return - _wait_for_group_exit(pgid, 2.0) + # Reap our direct child (the wrapper bash). We do NOT wait + # for the entire process group to disappear via kill(-pgid, 0) + # — that requires PID 1 (or some ancestor) to reap orphaned + # zombies, which is not guaranteed in containers without a + # proper init (tini/dumb-init). The processes have stopped; + # their zombie entries are the kernel's bookkeeping, not our + # problem to wait on. try: - proc.wait(timeout=0.2) + proc.wait(timeout=0.5) except (subprocess.TimeoutExpired, OSError): pass except (ProcessLookupError, PermissionError, OSError):