fix(local-env): SIGKILL directly on cleanup; don't wait for zombie reap #3

Open
claude-ceo-assistant wants to merge 2 commits from fix/kill-process-direct-sigkill into main
3 changed files with 44 additions and 51 deletions

View File

@ -67,6 +67,9 @@ AUTHOR_MAP = {
"274096618+hermes-agent-dhabibi@users.noreply.github.com": "dhabibi",
"dejie.guo@gmail.com": "JayGwod",
"maxence@groine.fr": "MaxyMoos",
# Internal molecule-ai Gitea bot identity used by Claude-Code agents
# (post-2026-05-06 GitHub suspension; no upstream/GitHub equivalent).
"claude-ceo-assistant@agents.moleculesai.app": "claude-ceo-assistant",
# OpenViking viking_read salvage (April 2026)
"hitesh@gmail.com": "htsh",
"pty819@outlook.com": "pty819",

View File

@ -30,12 +30,31 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
def _pgid_still_alive(pgid: int) -> bool:
"""Return True if any process in the given process group is still alive."""
"""Return True if any LIVE (non-zombie) process in the group remains.
Zombies (stat=Z) are stopped the kernel has cleaned up their state but
PID 1 hasn't called wait() yet. In containers without a proper reaping
init at PID 1 (tini, dumb-init), zombies linger until container exit. We
don't want the test to fail because of unreaped bookkeeping; we want it
to fail only if a process is actually still executing.
"""
try:
os.killpg(pgid, 0) # signal 0 = existence check
return True
except ProcessLookupError:
return False
out = subprocess.run(
["ps", "-g", str(pgid), "-o", "stat="],
capture_output=True, text=True, check=False,
).stdout
except Exception:
# Fall back to the old behaviour if ps is unavailable.
try:
os.killpg(pgid, 0)
return True
except ProcessLookupError:
return False
for line in out.splitlines():
stat = line.strip()
if stat and not stat.startswith("Z"):
return True
return False
def _process_group_snapshot(pgid: int) -> str:
@ -71,6 +90,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
_hermes_pgid=67890,
poll=lambda: 0,
kill=lambda: None,
wait=lambda timeout=None: 0,
)
killpg_calls = []
@ -79,15 +99,15 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
def fake_killpg(pgid, sig):
killpg_calls.append((pgid, sig))
if sig == 0:
raise ProcessLookupError
monkeypatch.setattr(os, "getpgid", fake_getpgid)
monkeypatch.setattr(os, "killpg", fake_killpg)
env._kill_process(proc)
assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)]
# Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
# because the caller (timeout/interrupt branches) has already given up.
assert killpg_calls == [(67890, signal.SIGKILL)]
def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():

View File

@ -384,35 +384,6 @@ class LocalEnvironment(BaseEnvironment):
def _kill_process(self, proc):
"""Kill the entire process group (all children)."""
def _group_alive(pgid: int) -> bool:
try:
# POSIX-only: _IS_WINDOWS is handled before this helper is used.
os.killpg(pgid, 0)
return True
except ProcessLookupError:
return False
except PermissionError:
# The group exists, even if this process cannot signal it.
return True
def _wait_for_group_exit(pgid: int, timeout: float) -> bool:
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
# Reap the wrapper promptly. A dead but unreaped group leader
# still makes killpg(pgid, 0) report the group as alive.
try:
proc.poll()
except Exception:
pass
if not _group_alive(pgid):
return True
time.sleep(0.05)
try:
proc.poll()
except Exception:
pass
return not _group_alive(pgid)
try:
if _IS_WINDOWS:
proc.terminate()
@ -424,25 +395,24 @@ class LocalEnvironment(BaseEnvironment):
if pgid is None:
raise
# Cleanup path: timeout / KeyboardInterrupt / SystemExit.
# Graceful shutdown is not wanted — the caller has given up on
# the process. Send SIGKILL directly: it's unblockable and the
# kernel processes it synchronously. By the time the syscall
# returns, every process in the group is marked dead.
try:
os.killpg(pgid, signal.SIGTERM)
except ProcessLookupError:
return
# Wait on the process group, not just the shell wrapper. Under
# load the wrapper can exit before grandchildren do; returning
# at that point leaves orphaned process-group members behind.
if _wait_for_group_exit(pgid, 1.0):
return
try:
# POSIX-only: _IS_WINDOWS is handled by the outer branch.
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
return
_wait_for_group_exit(pgid, 2.0)
# Reap our direct child (the wrapper bash). We do NOT wait
# for the entire process group to disappear via kill(-pgid, 0)
# — that requires PID 1 (or some ancestor) to reap orphaned
# zombies, which is not guaranteed in containers without a
# proper init (tini/dumb-init). The processes have stopped;
# their zombie entries are the kernel's bookkeeping, not our
# problem to wait on.
try:
proc.wait(timeout=0.2)
proc.wait(timeout=0.5)
except (subprocess.TimeoutExpired, OSError):
pass
except (ProcessLookupError, PermissionError, OSError):