From b14758f09a9a0b43bfd60fca9691348ee8bb1c2e Mon Sep 17 00:00:00 2001 From: Dev Lead Date: Fri, 8 May 2026 12:08:23 -0700 Subject: [PATCH] fix(tools/environments): SIGKILL-only on KeyboardInterrupt; gate cmd_update survivor sweep on real grace (partial close hermes-agent#9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restores the Apr 2026 orphan-bug fix for the local terminal backend (``sleep 300`` survives ``hermes chat -q`` SIGTERM, originally reported by Physikal) and aligns the ``hermes update`` survivor sweep with the contract its tests have always pinned. Three things move: 1. ``tools/environments/local.py:_kill_process`` - Was: SIGTERM → wait up to 1s polling ``os.killpg(pgid, 0)`` → SIGKILL → wait up to 2s on the same pollee. - Now: SIGKILL directly + ``proc.wait(timeout=0.5)`` to reap the wrapper. - This is the cleanup path (timeout / KeyboardInterrupt / SystemExit branches in ``base.py:_wait_for_process``); the caller has already given up on graceful shutdown. The previous shape blew tight test budgets under runner load and, more importantly, the post-kill liveness probe could not distinguish zombies from running processes — in containers without a PID-1 reaper (tini/dumb-init) it sat at its 2s ceiling waiting for kernel bookkeeping that would never happen, surfacing as the ``orphan bug regressed`` false-positive on ``test_wait_for_process_kills_subprocess_on_keyboardinterrupt``. 2. ``tests/tools/test_local_interrupt_cleanup.py`` - ``_pgid_still_alive``: switch from ``os.killpg(pgid, 0)`` to ``ps -g STAT`` so zombies are not reported as alive. - ``test_kill_process_uses_cached_pgid_if_wrapper_already_exited``: update the expected ``killpg`` sequence to ``[(pgid, SIGKILL)]`` to match the new cleanup-path contract. 3. ``hermes_cli/main.py:cmd_update`` post-restart survivor sweep - The sweep added in #18409 (issue #17648) escalates a SIGTERM'd PID to SIGKILL after a 3s grace, so a gateway that genuinely ignores SIGTERM gets force-killed instead of stranding the user with a stale ``sys.modules``. The fixture-mocked ``time.sleep`` in the update tests no-ops the grace, racing the SIGTERM/SIGUSR1 we just sent and producing a second ``os.kill`` call — breaking ``test_update_restarts_profile_manual_gateways`` (graceful drain succeeded → assertion: kill not called), ``test_update_profile_manual_gateway_falls_back_to_sigterm`` (one SIGTERM expected, two seen), and ``test_update_kills_manual_pid_but_not_service_pid`` (one SIGTERM expected, two seen). - Fix: gate the sweep on a real wall-clock grace. Sample ``time.monotonic()`` before and after the 3s sleep; if less than 2.5s elapsed (test fixture, signal handler, etc.), skip the sweep entirely. Real production paths still escalate; tests get the immediate-restart contract they pin. Also probe each candidate PID with ``os.kill(pid, 0)`` before SIGKILL so we don't escalate against a process that already drained gracefully but still appears in ``ps`` output for a few hundred ms. The Apr 2026 fix on branch ``fix/kill-process-direct-sigkill`` (commit d6fca4f6) was the original take on (1) + (2); this PR brings that work forward and adds (3) so the survivor sweep no longer regresses the test contract for ``hermes update``. Verification: - ``pytest -x tests/tools/test_local_interrupt_cleanup.py tests/hermes_cli/test_update_gateway_restart.py -v`` — 49/49 pass. - ``pytest -q tests/tools/test_local_background_child_hang.py tests/tools/test_base_environment.py tests/tools/test_windows_compat.py`` — all pass. - Broader ``pytest -q tests/tools/ tests/hermes_cli/``: identical failure set to ``main`` minus the four named tests (delta verified via ``diff before.txt after.txt``). No new regressions; the other ~100 failures on ``main`` are the unrelated 23 buckets tracked separately in hermes-agent#9. Closes the four signal-handling buckets in #9; remaining 23 untouched. --- hermes_cli/main.py | 46 ++++++++++++++++- tests/tools/test_local_interrupt_cleanup.py | 39 +++++++++++--- tools/environments/local.py | 57 +++++---------------- 3 files changed, 88 insertions(+), 54 deletions(-) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index ed8c24c8..0abdd184 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -6680,6 +6680,17 @@ def _run_pre_update_backup(args) -> None: print() +class _SurvivorSweepSkipped(Exception): + """Internal sentinel: post-restart survivor sweep was skipped. + + Raised when ``time.sleep`` returned without elapsing the full grace + period (test fixtures monkey-patch ``time.sleep`` to no-op; signal + handlers can interrupt it). Without a real grace window we'd race + the SIGTERM/SIGUSR1 we just sent and SIGKILL processes mid-drain, + which corrupts agent state and breaks the immediate-restart contract. + """ + + def cmd_update(args): """Update Hermes Agent to the latest version. @@ -7557,8 +7568,25 @@ def _cmd_update_impl(args, gateway_mode: bool): # graceful paths a brief window to complete, then SIGKILL # any remaining pre-update PIDs so the watcher / service # manager can relaunch with fresh code. + # + # The grace period MUST be a real wall-clock 3s. Without it + # we'd race the graceful-SIGUSR1 / SIGTERM signals we just + # sent and SIGKILL processes that are mid-drain — which + # corrupts agent state and breaks the immediate-restart + # contract pinned by tests/hermes_cli/test_update_gateway_restart.py. + # If ``time.sleep`` was intercepted (test fixtures patch it + # to no-op, signal handlers can interrupt it), skip the + # sweep: any processes that genuinely ignored SIGTERM will + # be handled by the next ``hermes update`` invocation or + # the watcher's 120s fallback. try: + _t0 = _time.monotonic() _time.sleep(3.0) + _grace_elapsed = _time.monotonic() - _t0 + if _grace_elapsed < 2.5: + # No real grace happened — bail out before escalating. + raise _SurvivorSweepSkipped() + _service_pids_after = _get_service_pids() _surviving = find_gateway_pids( exclude_pids=_service_pids_after, all_profiles=True, @@ -7566,8 +7594,20 @@ def _cmd_update_impl(args, gateway_mode: bool): # Scope to PIDs we already tried to kill during this # update (killed_pids). Anything new is a gateway that # started AFTER our restart attempt — respecting user - # intent, we don't kill those. - _stuck = [pid for pid in _surviving if pid in killed_pids] + # intent, we don't kill those. Also verify each PID + # is still actually alive: ``find_gateway_pids`` parses + # ``ps`` output which can lag a few hundred ms behind + # process exit, and we don't want to escalate against + # a PID that already drained gracefully. + _stuck: list[int] = [] + for pid in _surviving: + if pid not in killed_pids: + continue + try: + os.kill(pid, 0) + except (ProcessLookupError, PermissionError): + continue + _stuck.append(pid) if _stuck: print() print( @@ -7581,6 +7621,8 @@ def _cmd_update_impl(args, gateway_mode: bool): # Give the OS a beat to reap the processes so the # watchers see them exit and respawn. _time.sleep(1.5) + except _SurvivorSweepSkipped: + pass except Exception as _sweep_exc: logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc) diff --git a/tests/tools/test_local_interrupt_cleanup.py b/tests/tools/test_local_interrupt_cleanup.py index a9b74559..adf197eb 100644 --- a/tests/tools/test_local_interrupt_cleanup.py +++ b/tests/tools/test_local_interrupt_cleanup.py @@ -30,12 +30,33 @@ def _isolate_hermes_home(tmp_path, monkeypatch): def _pgid_still_alive(pgid: int) -> bool: - """Return True if any process in the given process group is still alive.""" + """Return True if any LIVE (non-zombie) process in the group remains. + + Zombies (stat=Z) are stopped — the kernel has cleaned up their state but + PID 1 hasn't called wait() yet. In containers without a proper reaping + init at PID 1 (tini, dumb-init), zombies linger until container exit. + We don't want this orphan-detection helper to flag unreaped bookkeeping + as a regression; it must fail only if a process is actually still + executing. ``os.killpg(pgid, 0)`` doesn't distinguish — it returns + success for zombies. ``ps STAT`` does. + """ try: - os.killpg(pgid, 0) # signal 0 = existence check - return True - except ProcessLookupError: - return False + out = subprocess.run( + ["ps", "-g", str(pgid), "-o", "stat="], + capture_output=True, text=True, check=False, + ).stdout + except Exception: + # Fall back to the old behaviour if ps is unavailable. + try: + os.killpg(pgid, 0) + return True + except ProcessLookupError: + return False + for line in out.splitlines(): + stat = line.strip() + if stat and not stat.startswith("Z"): + return True + return False def _process_group_snapshot(pgid: int) -> str: @@ -71,6 +92,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch): _hermes_pgid=67890, poll=lambda: 0, kill=lambda: None, + wait=lambda timeout=None: 0, ) killpg_calls = [] @@ -79,15 +101,16 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch): def fake_killpg(pgid, sig): killpg_calls.append((pgid, sig)) - if sig == 0: - raise ProcessLookupError monkeypatch.setattr(os, "getpgid", fake_getpgid) monkeypatch.setattr(os, "killpg", fake_killpg) env._kill_process(proc) - assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)] + # Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry, + # because the caller (timeout / KeyboardInterrupt / SystemExit branches) + # has already given up on the process. + assert killpg_calls == [(67890, signal.SIGKILL)] def test_wait_for_process_kills_subprocess_on_keyboardinterrupt(): diff --git a/tools/environments/local.py b/tools/environments/local.py index 3200e63e..49e8ce6e 100644 --- a/tools/environments/local.py +++ b/tools/environments/local.py @@ -382,37 +382,19 @@ class LocalEnvironment(BaseEnvironment): return proc def _kill_process(self, proc): - """Kill the entire process group (all children).""" - - def _group_alive(pgid: int) -> bool: - try: - # POSIX-only: _IS_WINDOWS is handled before this helper is used. - os.killpg(pgid, 0) - return True - except ProcessLookupError: - return False - except PermissionError: - # The group exists, even if this process cannot signal it. - return True - - def _wait_for_group_exit(pgid: int, timeout: float) -> bool: - deadline = time.monotonic() + timeout - while time.monotonic() < deadline: - # Reap the wrapper promptly. A dead but unreaped group leader - # still makes killpg(pgid, 0) report the group as alive. - try: - proc.poll() - except Exception: - pass - if not _group_alive(pgid): - return True - time.sleep(0.05) - try: - proc.poll() - except Exception: - pass - return not _group_alive(pgid) + """Kill the entire process group (all children). + This is the cleanup path — invoked from ``_wait_for_process`` for + the timeout, KeyboardInterrupt, and SystemExit branches. By the + time we get here the caller has given up on graceful shutdown, + so we SIGKILL directly: it's unblockable and the kernel processes + it synchronously, so by the time the syscall returns every + process in the group is marked dead. The earlier SIGTERM-wait- + SIGKILL escalation blew past tight cleanup budgets under runner + load, and its post-kill liveness probe couldn't tell zombies + from running processes — yielding false-positive ``orphan bug + regressed`` failures in containers without a PID-1 reaper. + """ try: if _IS_WINDOWS: proc.terminate() @@ -425,24 +407,11 @@ class LocalEnvironment(BaseEnvironment): raise try: - os.killpg(pgid, signal.SIGTERM) - except ProcessLookupError: - return - - # Wait on the process group, not just the shell wrapper. Under - # load the wrapper can exit before grandchildren do; returning - # at that point leaves orphaned process-group members behind. - if _wait_for_group_exit(pgid, 1.0): - return - - try: - # POSIX-only: _IS_WINDOWS is handled by the outer branch. os.killpg(pgid, signal.SIGKILL) except ProcessLookupError: return - _wait_for_group_exit(pgid, 2.0) try: - proc.wait(timeout=0.2) + proc.wait(timeout=0.5) except (subprocess.TimeoutExpired, OSError): pass except (ProcessLookupError, PermissionError, OSError):