fix(local-env): SIGKILL directly on cleanup; don't wait for zombie reap #3
@ -67,6 +67,9 @@ AUTHOR_MAP = {
|
||||
"274096618+hermes-agent-dhabibi@users.noreply.github.com": "dhabibi",
|
||||
"dejie.guo@gmail.com": "JayGwod",
|
||||
"maxence@groine.fr": "MaxyMoos",
|
||||
# Internal molecule-ai Gitea bot identity used by Claude-Code agents
|
||||
# (post-2026-05-06 GitHub suspension; no upstream/GitHub equivalent).
|
||||
"claude-ceo-assistant@agents.moleculesai.app": "claude-ceo-assistant",
|
||||
# OpenViking viking_read salvage (April 2026)
|
||||
"hitesh@gmail.com": "htsh",
|
||||
"pty819@outlook.com": "pty819",
|
||||
|
||||
@ -30,12 +30,31 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
|
||||
|
||||
|
||||
def _pgid_still_alive(pgid: int) -> bool:
|
||||
"""Return True if any process in the given process group is still alive."""
|
||||
"""Return True if any LIVE (non-zombie) process in the group remains.
|
||||
|
||||
Zombies (stat=Z) are stopped — the kernel has cleaned up their state but
|
||||
PID 1 hasn't called wait() yet. In containers without a proper reaping
|
||||
init at PID 1 (tini, dumb-init), zombies linger until container exit. We
|
||||
don't want the test to fail because of unreaped bookkeeping; we want it
|
||||
to fail only if a process is actually still executing.
|
||||
"""
|
||||
try:
|
||||
os.killpg(pgid, 0) # signal 0 = existence check
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
out = subprocess.run(
|
||||
["ps", "-g", str(pgid), "-o", "stat="],
|
||||
capture_output=True, text=True, check=False,
|
||||
).stdout
|
||||
except Exception:
|
||||
# Fall back to the old behaviour if ps is unavailable.
|
||||
try:
|
||||
os.killpg(pgid, 0)
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
for line in out.splitlines():
|
||||
stat = line.strip()
|
||||
if stat and not stat.startswith("Z"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _process_group_snapshot(pgid: int) -> str:
|
||||
@ -71,6 +90,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
|
||||
_hermes_pgid=67890,
|
||||
poll=lambda: 0,
|
||||
kill=lambda: None,
|
||||
wait=lambda timeout=None: 0,
|
||||
)
|
||||
killpg_calls = []
|
||||
|
||||
@ -79,15 +99,15 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
|
||||
|
||||
def fake_killpg(pgid, sig):
|
||||
killpg_calls.append((pgid, sig))
|
||||
if sig == 0:
|
||||
raise ProcessLookupError
|
||||
|
||||
monkeypatch.setattr(os, "getpgid", fake_getpgid)
|
||||
monkeypatch.setattr(os, "killpg", fake_killpg)
|
||||
|
||||
env._kill_process(proc)
|
||||
|
||||
assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)]
|
||||
# Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
|
||||
# because the caller (timeout/interrupt branches) has already given up.
|
||||
assert killpg_calls == [(67890, signal.SIGKILL)]
|
||||
|
||||
|
||||
def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
|
||||
|
||||
@ -384,35 +384,6 @@ class LocalEnvironment(BaseEnvironment):
|
||||
def _kill_process(self, proc):
|
||||
"""Kill the entire process group (all children)."""
|
||||
|
||||
def _group_alive(pgid: int) -> bool:
|
||||
try:
|
||||
# POSIX-only: _IS_WINDOWS is handled before this helper is used.
|
||||
os.killpg(pgid, 0)
|
||||
return True
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
# The group exists, even if this process cannot signal it.
|
||||
return True
|
||||
|
||||
def _wait_for_group_exit(pgid: int, timeout: float) -> bool:
|
||||
deadline = time.monotonic() + timeout
|
||||
while time.monotonic() < deadline:
|
||||
# Reap the wrapper promptly. A dead but unreaped group leader
|
||||
# still makes killpg(pgid, 0) report the group as alive.
|
||||
try:
|
||||
proc.poll()
|
||||
except Exception:
|
||||
pass
|
||||
if not _group_alive(pgid):
|
||||
return True
|
||||
time.sleep(0.05)
|
||||
try:
|
||||
proc.poll()
|
||||
except Exception:
|
||||
pass
|
||||
return not _group_alive(pgid)
|
||||
|
||||
try:
|
||||
if _IS_WINDOWS:
|
||||
proc.terminate()
|
||||
@ -424,25 +395,24 @@ class LocalEnvironment(BaseEnvironment):
|
||||
if pgid is None:
|
||||
raise
|
||||
|
||||
# Cleanup path: timeout / KeyboardInterrupt / SystemExit.
|
||||
# Graceful shutdown is not wanted — the caller has given up on
|
||||
# the process. Send SIGKILL directly: it's unblockable and the
|
||||
# kernel processes it synchronously. By the time the syscall
|
||||
# returns, every process in the group is marked dead.
|
||||
try:
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
return
|
||||
|
||||
# Wait on the process group, not just the shell wrapper. Under
|
||||
# load the wrapper can exit before grandchildren do; returning
|
||||
# at that point leaves orphaned process-group members behind.
|
||||
if _wait_for_group_exit(pgid, 1.0):
|
||||
return
|
||||
|
||||
try:
|
||||
# POSIX-only: _IS_WINDOWS is handled by the outer branch.
|
||||
os.killpg(pgid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
return
|
||||
_wait_for_group_exit(pgid, 2.0)
|
||||
# Reap our direct child (the wrapper bash). We do NOT wait
|
||||
# for the entire process group to disappear via kill(-pgid, 0)
|
||||
# — that requires PID 1 (or some ancestor) to reap orphaned
|
||||
# zombies, which is not guaranteed in containers without a
|
||||
# proper init (tini/dumb-init). The processes have stopped;
|
||||
# their zombie entries are the kernel's bookkeeping, not our
|
||||
# problem to wait on.
|
||||
try:
|
||||
proc.wait(timeout=0.2)
|
||||
proc.wait(timeout=0.5)
|
||||
except (subprocess.TimeoutExpired, OSError):
|
||||
pass
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user