fix(local-env): SIGKILL directly on cleanup; don't wait for zombie reap #3
@@ -33,12 +33,10 @@ def _pgid_still_alive(pgid: int) -> bool:
|
||||
"""Return True if any LIVE (non-zombie) process in the group remains.
|
||||
|
||||
Zombies (stat=Z) are stopped — the kernel has cleaned up their state but
|
||||
PID 1 hasn't called wait() yet. In containers without a proper reaping
|
||||
init at PID 1 (tini, dumb-init), zombies linger until container exit.
|
||||
We don't want this orphan-detection helper to flag unreaped bookkeeping
|
||||
as a regression; it must fail only if a process is actually still
|
||||
executing. ``os.killpg(pgid, 0)`` doesn't distinguish — it returns
|
||||
success for zombies. ``ps STAT`` does.
|
||||
PID 1 hasn't called wait() yet. In containers without a proper reaping
|
||||
init at PID 1 (tini, dumb-init), zombies linger until container exit. We
|
||||
don't want the test to fail because of unreaped bookkeeping; we want it
|
||||
to fail only if a process is actually still executing.
|
||||
"""
|
||||
try:
|
||||
out = subprocess.run(
|
||||
@@ -111,8 +109,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
|
||||
env._kill_process(proc)
|
||||
|
||||
# Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
|
||||
# because the caller (timeout / KeyboardInterrupt / SystemExit branches)
|
||||
# has already given up on the process.
|
||||
# because the caller (timeout/interrupt branches) has already given up.
|
||||
assert killpg_calls == [(67890, signal.SIGKILL)]
|
||||
|
||||
|
||||
@@ -135,18 +132,6 @@ def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
|
||||
started = threading.Event()
|
||||
raise_at = [None] # set by the main thread to tell worker when
|
||||
|
||||
# Intercept the subprocess spawned by execute() so we can get its PID
|
||||
# without depending on psutil or the ps command (both may be absent in
|
||||
# minimal CI containers).
|
||||
orig_run_bash = env._run_bash
|
||||
|
||||
def _capturing_run_bash(cmd_string, **kw):
|
||||
proc = orig_run_bash(cmd_string, **kw)
|
||||
proc_holder["proc"] = proc
|
||||
return proc
|
||||
|
||||
env._run_bash = _capturing_run_bash
|
||||
|
||||
# Drive execute() on a separate thread so we can SIGNAL-interrupt it
|
||||
# via a thread-targeted exception without killing our test process.
|
||||
def worker():
|
||||
@@ -162,18 +147,58 @@ def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
|
||||
t.start()
|
||||
# Wait until the subprocess actually exists. LocalEnvironment.execute
|
||||
# does init_session() (one spawn) before the real command, so we need
|
||||
# to wait until _run_bash has returned and captured the proc.
|
||||
deadline = time.monotonic() + 5.0
|
||||
# to wait until a sleep 30 is visible. Use pgrep-style lookup via
|
||||
# /proc to find the bash process running our sleep.
|
||||
deadline = time.monotonic() + 10.0
|
||||
target_pid = None
|
||||
our_pid = os.getpid()
|
||||
while time.monotonic() < deadline:
|
||||
if "proc" in proc_holder:
|
||||
# Walk our children and grand-children to find one running 'sleep 30'
|
||||
try:
|
||||
import psutil # optional — fall back if absent
|
||||
for p in psutil.Process(our_pid).children(recursive=True):
|
||||
try:
|
||||
if "sleep 30" in " ".join(p.cmdline()):
|
||||
target_pid = p.pid
|
||||
break
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
||||
continue
|
||||
except ImportError:
|
||||
# Fall back to ps — verify the candidate is in our process tree
|
||||
# so we don't accidentally match a concurrent xdist worker.
|
||||
ps = subprocess.run(
|
||||
["ps", "-eo", "pid,ppid,cmd"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
for line in ps.stdout.splitlines():
|
||||
if "sleep 30" in line and "grep" not in line:
|
||||
parts = line.split()
|
||||
if parts and parts[0].isdigit():
|
||||
cand = int(parts[0])
|
||||
try:
|
||||
p = cand
|
||||
for _ in range(20): # max depth guard
|
||||
if p == our_pid:
|
||||
target_pid = cand
|
||||
break
|
||||
with open(f"/proc/{p}/status") as f:
|
||||
for ln in f:
|
||||
if ln.startswith("PPid:"):
|
||||
p = int(ln.split()[1])
|
||||
break
|
||||
if target_pid:
|
||||
break
|
||||
except (FileNotFoundError, PermissionError, ValueError):
|
||||
continue
|
||||
if target_pid:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
else:
|
||||
raise AssertionError(
|
||||
"test setup: _run_bash did not return a proc within 5 s"
|
||||
)
|
||||
|
||||
target_pid = proc_holder["proc"].pid
|
||||
assert target_pid is not None, (
|
||||
"test setup: couldn't find 'sleep 30' subprocess after 10 s"
|
||||
)
|
||||
pgid = os.getpgid(target_pid)
|
||||
assert _pgid_still_alive(pgid), "sanity: subprocess should be alive"
|
||||
|
||||
@@ -193,8 +218,8 @@ def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
|
||||
|
||||
# Give the worker a moment to: hit the exception at the next poll,
|
||||
# run the except-block cleanup (_kill_process), and exit.
|
||||
t.join(timeout=5.0)
|
||||
assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"
|
||||
t.join(timeout=10.0)
|
||||
assert not t.is_alive(), "worker didn't exit within 10 s of the interrupt"
|
||||
|
||||
# The critical assertion: the subprocess GROUP must be dead. Not
|
||||
# just the bash wrapper — the 'sleep 30' child too. Under xdist load,
|
||||
|
||||
+13
-12
@@ -382,19 +382,8 @@ class LocalEnvironment(BaseEnvironment):
|
||||
return proc
|
||||
|
||||
def _kill_process(self, proc):
|
||||
"""Kill the entire process group (all children).
|
||||
"""Kill the entire process group (all children)."""
|
||||
|
||||
This is the cleanup path — invoked from ``_wait_for_process`` for
|
||||
the timeout, KeyboardInterrupt, and SystemExit branches. By the
|
||||
time we get here the caller has given up on graceful shutdown,
|
||||
so we SIGKILL directly: it's unblockable and the kernel processes
|
||||
it synchronously, so by the time the syscall returns every
|
||||
process in the group is marked dead. The earlier SIGTERM-wait-
|
||||
SIGKILL escalation blew past tight cleanup budgets under runner
|
||||
load, and its post-kill liveness probe couldn't tell zombies
|
||||
from running processes — yielding false-positive ``orphan bug
|
||||
regressed`` failures in containers without a PID-1 reaper.
|
||||
"""
|
||||
try:
|
||||
if _IS_WINDOWS:
|
||||
proc.terminate()
|
||||
@@ -406,10 +395,22 @@ class LocalEnvironment(BaseEnvironment):
|
||||
if pgid is None:
|
||||
raise
|
||||
|
||||
# Cleanup path: timeout / KeyboardInterrupt / SystemExit.
|
||||
# Graceful shutdown is not wanted — the caller has given up on
|
||||
# the process. Send SIGKILL directly: it's unblockable and the
|
||||
# kernel processes it synchronously. By the time the syscall
|
||||
# returns, every process in the group is marked dead.
|
||||
try:
|
||||
os.killpg(pgid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
return
|
||||
# Reap our direct child (the wrapper bash). We do NOT wait
|
||||
# for the entire process group to disappear via kill(-pgid, 0)
|
||||
# — that requires PID 1 (or some ancestor) to reap orphaned
|
||||
# zombies, which is not guaranteed in containers without a
|
||||
# proper init (tini/dumb-init). The processes have stopped;
|
||||
# their zombie entries are the kernel's bookkeeping, not our
|
||||
# problem to wait on.
|
||||
try:
|
||||
proc.wait(timeout=0.5)
|
||||
except (subprocess.TimeoutExpired, OSError):
|
||||
|
||||
Reference in New Issue
Block a user