fix(local-env): SIGKILL directly on cleanup; don't wait for zombie reap #3

Merged
agent-dev-a merged 2 commits from fix/kill-process-direct-sigkill into main 2026-05-26 02:02:44 +00:00
2 changed files with 68 additions and 42 deletions
+55 -30
View File
@@ -33,12 +33,10 @@ def _pgid_still_alive(pgid: int) -> bool:
"""Return True if any LIVE (non-zombie) process in the group remains.
Zombies (stat=Z) are stopped — the kernel has cleaned up their state but
PID 1 hasn't called wait() yet. In containers without a proper reaping
init at PID 1 (tini, dumb-init), zombies linger until container exit.
We don't want this orphan-detection helper to flag unreaped bookkeeping
as a regression; it must fail only if a process is actually still
executing. ``os.killpg(pgid, 0)`` doesn't distinguish — it returns
success for zombies. ``ps STAT`` does.
PID 1 hasn't called wait() yet. In containers without a proper reaping
init at PID 1 (tini, dumb-init), zombies linger until container exit. We
don't want the test to fail because of unreaped bookkeeping; we want it
to fail only if a process is actually still executing.
"""
try:
out = subprocess.run(
@@ -111,8 +109,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
env._kill_process(proc)
# Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
# because the caller (timeout / KeyboardInterrupt / SystemExit branches)
# has already given up on the process.
# because the caller (timeout/interrupt branches) has already given up.
assert killpg_calls == [(67890, signal.SIGKILL)]
@@ -135,18 +132,6 @@ def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
started = threading.Event()
raise_at = [None] # set by the main thread to tell worker when
# Intercept the subprocess spawned by execute() so we can get its PID
# without depending on psutil or the ps command (both may be absent in
# minimal CI containers).
orig_run_bash = env._run_bash
def _capturing_run_bash(cmd_string, **kw):
proc = orig_run_bash(cmd_string, **kw)
proc_holder["proc"] = proc
return proc
env._run_bash = _capturing_run_bash
# Drive execute() on a separate thread so we can SIGNAL-interrupt it
# via a thread-targeted exception without killing our test process.
def worker():
@@ -162,18 +147,58 @@ def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
t.start()
# Wait until the subprocess actually exists. LocalEnvironment.execute
# does init_session() (one spawn) before the real command, so we need
# to wait until _run_bash has returned and captured the proc.
deadline = time.monotonic() + 5.0
# to wait until a sleep 30 is visible. Use pgrep-style lookup via
# /proc to find the bash process running our sleep.
deadline = time.monotonic() + 10.0
target_pid = None
our_pid = os.getpid()
while time.monotonic() < deadline:
if "proc" in proc_holder:
# Walk our children and grand-children to find one running 'sleep 30'
try:
import psutil # optional — fall back if absent
for p in psutil.Process(our_pid).children(recursive=True):
try:
if "sleep 30" in " ".join(p.cmdline()):
target_pid = p.pid
break
except (psutil.NoSuchProcess, psutil.AccessDenied):
continue
except ImportError:
# Fall back to ps — verify the candidate is in our process tree
# so we don't accidentally match a concurrent xdist worker.
ps = subprocess.run(
["ps", "-eo", "pid,ppid,cmd"],
capture_output=True,
text=True,
check=False,
)
for line in ps.stdout.splitlines():
if "sleep 30" in line and "grep" not in line:
parts = line.split()
if parts and parts[0].isdigit():
cand = int(parts[0])
try:
p = cand
for _ in range(20): # max depth guard
if p == our_pid:
target_pid = cand
break
with open(f"/proc/{p}/status") as f:
for ln in f:
if ln.startswith("PPid:"):
p = int(ln.split()[1])
break
if target_pid:
break
except (FileNotFoundError, PermissionError, ValueError):
continue
if target_pid:
break
time.sleep(0.1)
else:
raise AssertionError(
"test setup: _run_bash did not return a proc within 5 s"
)
target_pid = proc_holder["proc"].pid
assert target_pid is not None, (
"test setup: couldn't find 'sleep 30' subprocess after 10 s"
)
pgid = os.getpgid(target_pid)
assert _pgid_still_alive(pgid), "sanity: subprocess should be alive"
@@ -193,8 +218,8 @@ def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
# Give the worker a moment to: hit the exception at the next poll,
# run the except-block cleanup (_kill_process), and exit.
t.join(timeout=5.0)
assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"
t.join(timeout=10.0)
assert not t.is_alive(), "worker didn't exit within 10 s of the interrupt"
# The critical assertion: the subprocess GROUP must be dead. Not
# just the bash wrapper — the 'sleep 30' child too. Under xdist load,
+13 -12
View File
@@ -382,19 +382,8 @@ class LocalEnvironment(BaseEnvironment):
return proc
def _kill_process(self, proc):
"""Kill the entire process group (all children).
"""Kill the entire process group (all children)."""
This is the cleanup path — invoked from ``_wait_for_process`` for
the timeout, KeyboardInterrupt, and SystemExit branches. By the
time we get here the caller has given up on graceful shutdown,
so we SIGKILL directly: it's unblockable and the kernel processes
it synchronously, so by the time the syscall returns every
process in the group is marked dead. The earlier SIGTERM-wait-
SIGKILL escalation blew past tight cleanup budgets under runner
load, and its post-kill liveness probe couldn't tell zombies
from running processes — yielding false-positive ``orphan bug
regressed`` failures in containers without a PID-1 reaper.
"""
try:
if _IS_WINDOWS:
proc.terminate()
@@ -406,10 +395,22 @@ class LocalEnvironment(BaseEnvironment):
if pgid is None:
raise
# Cleanup path: timeout / KeyboardInterrupt / SystemExit.
# Graceful shutdown is not wanted — the caller has given up on
# the process. Send SIGKILL directly: it's unblockable and the
# kernel processes it synchronously. By the time the syscall
# returns, every process in the group is marked dead.
try:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
return
# Reap our direct child (the wrapper bash). We do NOT wait
# for the entire process group to disappear via kill(-pgid, 0)
# — that requires PID 1 (or some ancestor) to reap orphaned
# zombies, which is not guaranteed in containers without a
# proper init (tini/dumb-init). The processes have stopped;
# their zombie entries are the kernel's bookkeeping, not our
# problem to wait on.
try:
proc.wait(timeout=0.5)
except (subprocess.TimeoutExpired, OSError):