hermes-agent/tests/tools/test_local_interrupt_cleanup.py

"""Regression tests for _wait_for_process subprocess cleanup on exception exit.

When the poll loop exits via KeyboardInterrupt or SystemExit (SIGTERM via
cli.py signal handler, SIGINT on the main thread in non-interactive -q mode,
or explicit sys.exit from some caller), the child subprocess must be killed
before the exception propagates — otherwise the local backend's use of
os.setsid leaves an orphan with PPID=1.

The live repro that motivated this: hermes chat -q ... 'sleep 300', SIGTERM
to the python process, sleep 300 survived with PPID=1 for the full 300 s
because _wait_for_process never got to call _kill_process before python
died.  See commit message for full context.
"""
import os
import signal
import subprocess
import threading
import time
from types import SimpleNamespace

import pytest

from tools.environments.local import LocalEnvironment


@pytest.fixture(autouse=True)
def _isolate_hermes_home(tmp_path, monkeypatch):
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    (tmp_path / "logs").mkdir(exist_ok=True)


def _pgid_still_alive(pgid: int) -> bool:
    """Return True if any LIVE (non-zombie) process in the group remains.

    Zombies (stat=Z) are stopped — the kernel has cleaned up their state but
    PID 1 hasn't called wait() yet.  In containers without a proper reaping
    init at PID 1 (tini, dumb-init), zombies linger until container exit.
    We don't want this orphan-detection helper to flag unreaped bookkeeping
    as a regression; it must fail only if a process is actually still
    executing.  ``os.killpg(pgid, 0)`` doesn't distinguish — it returns
    success for zombies.  ``ps STAT`` does.
    """
    try:
        out = subprocess.run(
            ["ps", "-g", str(pgid), "-o", "stat="],
            capture_output=True, text=True, check=False,
        ).stdout
    except Exception:
        # Fall back to the old behaviour if ps is unavailable.
        try:
            os.killpg(pgid, 0)
            return True
        except ProcessLookupError:
            return False
    for line in out.splitlines():
        stat = line.strip()
        if stat and not stat.startswith("Z"):
            return True
    return False


def _process_group_snapshot(pgid: int) -> str:
    """Return a process-table snapshot for diagnostics."""
    return subprocess.run(
        ["ps", "-o", "pid,ppid,pgid,stat,cmd", "-g", str(pgid)],
        capture_output=True,
        text=True,
        check=False,
    ).stdout.strip()


def _wait_for_pgid_exit(pgid: int, timeout: float = 10.0) -> bool:
    """Wait for a process group to disappear under loaded xdist hosts."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not _pgid_still_alive(pgid):
            return True
        time.sleep(0.1)
    return not _pgid_still_alive(pgid)


def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
    """If the shell wrapper exits before cleanup, still kill its process group.

    Without the cached pgid fallback, ``os.getpgid(proc.pid)`` raises for the
    dead wrapper and cleanup falls back to ``proc.kill()``, which cannot reach
    orphaned grandchildren still running in the original process group.
    """
    env = object.__new__(LocalEnvironment)
    proc = SimpleNamespace(
        pid=12345,
        _hermes_pgid=67890,
        poll=lambda: 0,
        kill=lambda: None,
        wait=lambda timeout=None: 0,
    )
    killpg_calls = []

    def fake_getpgid(_pid):
        raise ProcessLookupError

    def fake_killpg(pgid, sig):
        killpg_calls.append((pgid, sig))

    monkeypatch.setattr(os, "getpgid", fake_getpgid)
    monkeypatch.setattr(os, "killpg", fake_killpg)

    env._kill_process(proc)

    # Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
    # because the caller (timeout / KeyboardInterrupt / SystemExit branches)
    # has already given up on the process.
    assert killpg_calls == [(67890, signal.SIGKILL)]


def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
    """When KeyboardInterrupt arrives mid-poll, the subprocess group must be
    killed before the exception is re-raised."""
    env = LocalEnvironment(cwd="/tmp")
    try:
        result_holder = {}
        proc_holder = {}
        started = threading.Event()
        raise_at = [None]  # set by the main thread to tell worker when

        # Drive execute() on a separate thread so we can SIGNAL-interrupt it
        # via a thread-targeted exception without killing our test process.
        def worker():
            # Spawn a subprocess that will definitely be alive long enough
            # to observe the cleanup, via env.execute(...) — the normal path
            # that goes through _wait_for_process.
            try:
                result_holder["result"] = env.execute("sleep 30", timeout=60)
            except BaseException as e:  # noqa: BLE001 — we want to observe it
                result_holder["exception"] = type(e).__name__

        t = threading.Thread(target=worker, daemon=True)
        t.start()
        # Wait until the subprocess actually exists.  LocalEnvironment.execute
        # does init_session() (one spawn) before the real command, so we need
        # to wait until a sleep 30 is visible.  Use pgrep-style lookup via
        # /proc to find the bash process running our sleep.
        deadline = time.monotonic() + 5.0
        target_pid = None
        while time.monotonic() < deadline:
            # Walk our children and grand-children to find one running 'sleep 30'
            try:
                import psutil  # optional — fall back if absent
                for p in psutil.Process(os.getpid()).children(recursive=True):
                    try:
                        if "sleep 30" in " ".join(p.cmdline()):
                            target_pid = p.pid
                            break
                    except (psutil.NoSuchProcess, psutil.AccessDenied):
                        continue
            except ImportError:
                # Fall back to ps
                ps = subprocess.run(
                    ["ps", "-eo", "pid,ppid,pgid,cmd"], capture_output=True, text=True,
                )
                for line in ps.stdout.splitlines():
                    if "sleep 30" in line and "grep" not in line:
                        parts = line.split()
                        if parts and parts[0].isdigit():
                            target_pid = int(parts[0])
                            break
            if target_pid:
                break
            time.sleep(0.1)

        assert target_pid is not None, (
            "test setup: couldn't find 'sleep 30' subprocess after 5 s"
        )
        pgid = os.getpgid(target_pid)
        assert _pgid_still_alive(pgid), "sanity: subprocess should be alive"

        # Now inject a KeyboardInterrupt into the worker thread the same
        # way CPython's signal machinery would.  We use ctypes.PyThreadState_SetAsyncExc
        # which is how signal delivery to non-main threads is simulated.
        import ctypes
        import sys as _sys
        # py-thread-state exception targets need the ident, not the Thread
        tid = t.ident
        assert tid is not None
        # Fire KeyboardInterrupt into the worker thread
        ret = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            ctypes.c_ulong(tid), ctypes.py_object(KeyboardInterrupt),
        )
        assert ret == 1, f"SetAsyncExc returned {ret}, expected 1"

        # Give the worker a moment to: hit the exception at the next poll,
        # run the except-block cleanup (_kill_process), and exit.
        t.join(timeout=5.0)
        assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"

        # The critical assertion: the subprocess GROUP must be dead.  Not
        # just the bash wrapper — the 'sleep 30' child too. Under xdist load,
        # process-group disappearance can lag briefly after the worker exits,
        # especially if the process is already dying or waiting to be reaped.
        assert _wait_for_pgid_exit(pgid), (
            f"subprocess group {pgid} is STILL ALIVE after worker received "
            f"KeyboardInterrupt — orphan bug regressed.  This is the "
            f"sleep-300-survives-SIGTERM scenario from Physikal's Apr 2026 "
            f"report.  See tools/environments/base.py _wait_for_process "
            f"except-block.\n{_process_group_snapshot(pgid)}"
        )
        # And the worker should have observed the KeyboardInterrupt (i.e.
        # it re-raised cleanly, not silently swallowed).
        assert result_holder.get("exception") == "KeyboardInterrupt", (
            f"worker result: {result_holder!r} — expected KeyboardInterrupt "
            f"propagation after cleanup"
        )
    finally:
        try:
            env.cleanup()
        except Exception:
            pass