diff --git a/hermes_cli/main.py b/hermes_cli/main.py index ed8c24c8..0abdd184 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -6680,6 +6680,17 @@ def _run_pre_update_backup(args) -> None: print() +class _SurvivorSweepSkipped(Exception): + """Internal sentinel: post-restart survivor sweep was skipped. + + Raised when ``time.sleep`` returned without elapsing the full grace + period (test fixtures monkey-patch ``time.sleep`` to no-op; signal + handlers can interrupt it). Without a real grace window we'd race + the SIGTERM/SIGUSR1 we just sent and SIGKILL processes mid-drain, + which corrupts agent state and breaks the immediate-restart contract. + """ + + def cmd_update(args): """Update Hermes Agent to the latest version. @@ -7557,8 +7568,25 @@ def _cmd_update_impl(args, gateway_mode: bool): # graceful paths a brief window to complete, then SIGKILL # any remaining pre-update PIDs so the watcher / service # manager can relaunch with fresh code. + # + # The grace period MUST be a real wall-clock 3s. Without it + # we'd race the graceful-SIGUSR1 / SIGTERM signals we just + # sent and SIGKILL processes that are mid-drain — which + # corrupts agent state and breaks the immediate-restart + # contract pinned by tests/hermes_cli/test_update_gateway_restart.py. + # If ``time.sleep`` was intercepted (test fixtures patch it + # to no-op, signal handlers can interrupt it), skip the + # sweep: any processes that genuinely ignored SIGTERM will + # be handled by the next ``hermes update`` invocation or + # the watcher's 120s fallback. try: + _t0 = _time.monotonic() _time.sleep(3.0) + _grace_elapsed = _time.monotonic() - _t0 + if _grace_elapsed < 2.5: + # No real grace happened — bail out before escalating. + raise _SurvivorSweepSkipped() + _service_pids_after = _get_service_pids() _surviving = find_gateway_pids( exclude_pids=_service_pids_after, all_profiles=True, @@ -7566,8 +7594,20 @@ def _cmd_update_impl(args, gateway_mode: bool): # Scope to PIDs we already tried to kill during this # update (killed_pids). Anything new is a gateway that # started AFTER our restart attempt — respecting user - # intent, we don't kill those. - _stuck = [pid for pid in _surviving if pid in killed_pids] + # intent, we don't kill those. Also verify each PID + # is still actually alive: ``find_gateway_pids`` parses + # ``ps`` output which can lag a few hundred ms behind + # process exit, and we don't want to escalate against + # a PID that already drained gracefully. + _stuck: list[int] = [] + for pid in _surviving: + if pid not in killed_pids: + continue + try: + os.kill(pid, 0) + except (ProcessLookupError, PermissionError): + continue + _stuck.append(pid) if _stuck: print() print( @@ -7581,6 +7621,8 @@ def _cmd_update_impl(args, gateway_mode: bool): # Give the OS a beat to reap the processes so the # watchers see them exit and respawn. _time.sleep(1.5) + except _SurvivorSweepSkipped: + pass except Exception as _sweep_exc: logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc) diff --git a/tests/tools/test_local_interrupt_cleanup.py b/tests/tools/test_local_interrupt_cleanup.py index a9b74559..adf197eb 100644 --- a/tests/tools/test_local_interrupt_cleanup.py +++ b/tests/tools/test_local_interrupt_cleanup.py @@ -30,12 +30,33 @@ def _isolate_hermes_home(tmp_path, monkeypatch): def _pgid_still_alive(pgid: int) -> bool: - """Return True if any process in the given process group is still alive.""" + """Return True if any LIVE (non-zombie) process in the group remains. + + Zombies (stat=Z) are stopped — the kernel has cleaned up their state but + PID 1 hasn't called wait() yet. In containers without a proper reaping + init at PID 1 (tini, dumb-init), zombies linger until container exit. + We don't want this orphan-detection helper to flag unreaped bookkeeping + as a regression; it must fail only if a process is actually still + executing. ``os.killpg(pgid, 0)`` doesn't distinguish — it returns + success for zombies. ``ps STAT`` does. + """ try: - os.killpg(pgid, 0) # signal 0 = existence check - return True - except ProcessLookupError: - return False + out = subprocess.run( + ["ps", "-g", str(pgid), "-o", "stat="], + capture_output=True, text=True, check=False, + ).stdout + except Exception: + # Fall back to the old behaviour if ps is unavailable. + try: + os.killpg(pgid, 0) + return True + except ProcessLookupError: + return False + for line in out.splitlines(): + stat = line.strip() + if stat and not stat.startswith("Z"): + return True + return False def _process_group_snapshot(pgid: int) -> str: @@ -71,6 +92,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch): _hermes_pgid=67890, poll=lambda: 0, kill=lambda: None, + wait=lambda timeout=None: 0, ) killpg_calls = [] @@ -79,15 +101,16 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch): def fake_killpg(pgid, sig): killpg_calls.append((pgid, sig)) - if sig == 0: - raise ProcessLookupError monkeypatch.setattr(os, "getpgid", fake_getpgid) monkeypatch.setattr(os, "killpg", fake_killpg) env._kill_process(proc) - assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)] + # Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry, + # because the caller (timeout / KeyboardInterrupt / SystemExit branches) + # has already given up on the process. + assert killpg_calls == [(67890, signal.SIGKILL)] def test_wait_for_process_kills_subprocess_on_keyboardinterrupt(): diff --git a/tools/environments/local.py b/tools/environments/local.py index 3200e63e..49e8ce6e 100644 --- a/tools/environments/local.py +++ b/tools/environments/local.py @@ -382,37 +382,19 @@ class LocalEnvironment(BaseEnvironment): return proc def _kill_process(self, proc): - """Kill the entire process group (all children).""" - - def _group_alive(pgid: int) -> bool: - try: - # POSIX-only: _IS_WINDOWS is handled before this helper is used. - os.killpg(pgid, 0) - return True - except ProcessLookupError: - return False - except PermissionError: - # The group exists, even if this process cannot signal it. - return True - - def _wait_for_group_exit(pgid: int, timeout: float) -> bool: - deadline = time.monotonic() + timeout - while time.monotonic() < deadline: - # Reap the wrapper promptly. A dead but unreaped group leader - # still makes killpg(pgid, 0) report the group as alive. - try: - proc.poll() - except Exception: - pass - if not _group_alive(pgid): - return True - time.sleep(0.05) - try: - proc.poll() - except Exception: - pass - return not _group_alive(pgid) + """Kill the entire process group (all children). + This is the cleanup path — invoked from ``_wait_for_process`` for + the timeout, KeyboardInterrupt, and SystemExit branches. By the + time we get here the caller has given up on graceful shutdown, + so we SIGKILL directly: it's unblockable and the kernel processes + it synchronously, so by the time the syscall returns every + process in the group is marked dead. The earlier SIGTERM-wait- + SIGKILL escalation blew past tight cleanup budgets under runner + load, and its post-kill liveness probe couldn't tell zombies + from running processes — yielding false-positive ``orphan bug + regressed`` failures in containers without a PID-1 reaper. + """ try: if _IS_WINDOWS: proc.terminate() @@ -425,24 +407,11 @@ class LocalEnvironment(BaseEnvironment): raise try: - os.killpg(pgid, signal.SIGTERM) - except ProcessLookupError: - return - - # Wait on the process group, not just the shell wrapper. Under - # load the wrapper can exit before grandchildren do; returning - # at that point leaves orphaned process-group members behind. - if _wait_for_group_exit(pgid, 1.0): - return - - try: - # POSIX-only: _IS_WINDOWS is handled by the outer branch. os.killpg(pgid, signal.SIGKILL) except ProcessLookupError: return - _wait_for_group_exit(pgid, 2.0) try: - proc.wait(timeout=0.2) + proc.wait(timeout=0.5) except (subprocess.TimeoutExpired, OSError): pass except (ProcessLookupError, PermissionError, OSError):