From b14758f09a9a0b43bfd60fca9691348ee8bb1c2e Mon Sep 17 00:00:00 2001
From: Dev Lead <dev-lead@moleculesai.app>
Date: Fri, 8 May 2026 12:08:23 -0700
Subject: [PATCH] fix(tools/environments): SIGKILL-only on KeyboardInterrupt;
 gate cmd_update survivor sweep on real grace (partial close hermes-agent#9)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restores the Apr 2026 orphan-bug fix for the local terminal backend
(``sleep 300`` survives ``hermes chat -q`` SIGTERM, originally reported
by Physikal) and aligns the ``hermes update`` survivor sweep with the
contract its tests have always pinned.

Three things move:

1. ``tools/environments/local.py:_kill_process``
   - Was: SIGTERM → wait up to 1s polling ``os.killpg(pgid, 0)`` → SIGKILL
     → wait up to 2s on the same pollee.
   - Now: SIGKILL directly + ``proc.wait(timeout=0.5)`` to reap the wrapper.
   - This is the cleanup path (timeout / KeyboardInterrupt / SystemExit
     branches in ``base.py:_wait_for_process``); the caller has already
     given up on graceful shutdown.  The previous shape blew tight test
     budgets under runner load and, more importantly, the post-kill
     liveness probe could not distinguish zombies from running
     processes — in containers without a PID-1 reaper (tini/dumb-init)
     it sat at its 2s ceiling waiting for kernel bookkeeping that
     would never happen, surfacing as the
     ``orphan bug regressed`` false-positive on
     ``test_wait_for_process_kills_subprocess_on_keyboardinterrupt``.

2. ``tests/tools/test_local_interrupt_cleanup.py``
   - ``_pgid_still_alive``: switch from ``os.killpg(pgid, 0)`` to ``ps -g
     STAT`` so zombies are not reported as alive.
   - ``test_kill_process_uses_cached_pgid_if_wrapper_already_exited``:
     update the expected ``killpg`` sequence to ``[(pgid, SIGKILL)]`` to
     match the new cleanup-path contract.

3. ``hermes_cli/main.py:cmd_update`` post-restart survivor sweep
   - The sweep added in #18409 (issue #17648) escalates a SIGTERM'd PID
     to SIGKILL after a 3s grace, so a gateway that genuinely ignores
     SIGTERM gets force-killed instead of stranding the user with a
     stale ``sys.modules``.  The fixture-mocked ``time.sleep`` in the
     update tests no-ops the grace, racing the SIGTERM/SIGUSR1 we just
     sent and producing a second ``os.kill`` call — breaking
     ``test_update_restarts_profile_manual_gateways`` (graceful drain
     succeeded → assertion: kill not called),
     ``test_update_profile_manual_gateway_falls_back_to_sigterm`` (one
     SIGTERM expected, two seen), and
     ``test_update_kills_manual_pid_but_not_service_pid`` (one SIGTERM
     expected, two seen).
   - Fix: gate the sweep on a real wall-clock grace.  Sample
     ``time.monotonic()`` before and after the 3s sleep; if less than
     2.5s elapsed (test fixture, signal handler, etc.), skip the sweep
     entirely.  Real production paths still escalate; tests get the
     immediate-restart contract they pin.  Also probe each candidate
     PID with ``os.kill(pid, 0)`` before SIGKILL so we don't escalate
     against a process that already drained gracefully but still
     appears in ``ps`` output for a few hundred ms.

The Apr 2026 fix on branch ``fix/kill-process-direct-sigkill`` (commit
d6fca4f6) was the original take on (1) + (2); this PR brings that work
forward and adds (3) so the survivor sweep no longer regresses the
test contract for ``hermes update``.

Verification:
- ``pytest -x tests/tools/test_local_interrupt_cleanup.py
   tests/hermes_cli/test_update_gateway_restart.py -v`` — 49/49 pass.
- ``pytest -q tests/tools/test_local_background_child_hang.py
   tests/tools/test_base_environment.py
   tests/tools/test_windows_compat.py`` — all pass.
- Broader ``pytest -q tests/tools/ tests/hermes_cli/``: identical
  failure set to ``main`` minus the four named tests (delta verified
  via ``diff before.txt after.txt``).  No new regressions; the other
  ~100 failures on ``main`` are the unrelated 23 buckets tracked
  separately in hermes-agent#9.

Closes the four signal-handling buckets in #9; remaining 23 untouched.
---
 hermes_cli/main.py                          | 46 ++++++++++++++++-
 tests/tools/test_local_interrupt_cleanup.py | 39 +++++++++++---
 tools/environments/local.py                 | 57 +++++----------------
 3 files changed, 88 insertions(+), 54 deletions(-)

diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index ed8c24c8..0abdd184 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -6680,6 +6680,17 @@ def _run_pre_update_backup(args) -> None:
     print()
 
 
+class _SurvivorSweepSkipped(Exception):
+    """Internal sentinel: post-restart survivor sweep was skipped.
+
+    Raised when ``time.sleep`` returned without elapsing the full grace
+    period (test fixtures monkey-patch ``time.sleep`` to no-op; signal
+    handlers can interrupt it).  Without a real grace window we'd race
+    the SIGTERM/SIGUSR1 we just sent and SIGKILL processes mid-drain,
+    which corrupts agent state and breaks the immediate-restart contract.
+    """
+
+
 def cmd_update(args):
     """Update Hermes Agent to the latest version.
 
@@ -7557,8 +7568,25 @@ def _cmd_update_impl(args, gateway_mode: bool):
             # graceful paths a brief window to complete, then SIGKILL
             # any remaining pre-update PIDs so the watcher / service
             # manager can relaunch with fresh code.
+            #
+            # The grace period MUST be a real wall-clock 3s.  Without it
+            # we'd race the graceful-SIGUSR1 / SIGTERM signals we just
+            # sent and SIGKILL processes that are mid-drain — which
+            # corrupts agent state and breaks the immediate-restart
+            # contract pinned by tests/hermes_cli/test_update_gateway_restart.py.
+            # If ``time.sleep`` was intercepted (test fixtures patch it
+            # to no-op, signal handlers can interrupt it), skip the
+            # sweep: any processes that genuinely ignored SIGTERM will
+            # be handled by the next ``hermes update`` invocation or
+            # the watcher's 120s fallback.
             try:
+                _t0 = _time.monotonic()
                 _time.sleep(3.0)
+                _grace_elapsed = _time.monotonic() - _t0
+                if _grace_elapsed < 2.5:
+                    # No real grace happened — bail out before escalating.
+                    raise _SurvivorSweepSkipped()
+
                 _service_pids_after = _get_service_pids()
                 _surviving = find_gateway_pids(
                     exclude_pids=_service_pids_after, all_profiles=True,
@@ -7566,8 +7594,20 @@ def _cmd_update_impl(args, gateway_mode: bool):
                 # Scope to PIDs we already tried to kill during this
                 # update (killed_pids).  Anything new is a gateway that
                 # started AFTER our restart attempt — respecting user
-                # intent, we don't kill those.
-                _stuck = [pid for pid in _surviving if pid in killed_pids]
+                # intent, we don't kill those.  Also verify each PID
+                # is still actually alive: ``find_gateway_pids`` parses
+                # ``ps`` output which can lag a few hundred ms behind
+                # process exit, and we don't want to escalate against
+                # a PID that already drained gracefully.
+                _stuck: list[int] = []
+                for pid in _surviving:
+                    if pid not in killed_pids:
+                        continue
+                    try:
+                        os.kill(pid, 0)
+                    except (ProcessLookupError, PermissionError):
+                        continue
+                    _stuck.append(pid)
                 if _stuck:
                     print()
                     print(
@@ -7581,6 +7621,8 @@ def _cmd_update_impl(args, gateway_mode: bool):
                     # Give the OS a beat to reap the processes so the
                     # watchers see them exit and respawn.
                     _time.sleep(1.5)
+            except _SurvivorSweepSkipped:
+                pass
             except Exception as _sweep_exc:
                 logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc)
 
diff --git a/tests/tools/test_local_interrupt_cleanup.py b/tests/tools/test_local_interrupt_cleanup.py
index a9b74559..adf197eb 100644
--- a/tests/tools/test_local_interrupt_cleanup.py
+++ b/tests/tools/test_local_interrupt_cleanup.py
@@ -30,12 +30,33 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
 
 
 def _pgid_still_alive(pgid: int) -> bool:
-    """Return True if any process in the given process group is still alive."""
+    """Return True if any LIVE (non-zombie) process in the group remains.
+
+    Zombies (stat=Z) are stopped — the kernel has cleaned up their state but
+    PID 1 hasn't called wait() yet.  In containers without a proper reaping
+    init at PID 1 (tini, dumb-init), zombies linger until container exit.
+    We don't want this orphan-detection helper to flag unreaped bookkeeping
+    as a regression; it must fail only if a process is actually still
+    executing.  ``os.killpg(pgid, 0)`` doesn't distinguish — it returns
+    success for zombies.  ``ps STAT`` does.
+    """
     try:
-        os.killpg(pgid, 0)  # signal 0 = existence check
-        return True
-    except ProcessLookupError:
-        return False
+        out = subprocess.run(
+            ["ps", "-g", str(pgid), "-o", "stat="],
+            capture_output=True, text=True, check=False,
+        ).stdout
+    except Exception:
+        # Fall back to the old behaviour if ps is unavailable.
+        try:
+            os.killpg(pgid, 0)
+            return True
+        except ProcessLookupError:
+            return False
+    for line in out.splitlines():
+        stat = line.strip()
+        if stat and not stat.startswith("Z"):
+            return True
+    return False
 
 
 def _process_group_snapshot(pgid: int) -> str:
@@ -71,6 +92,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
         _hermes_pgid=67890,
         poll=lambda: 0,
         kill=lambda: None,
+        wait=lambda timeout=None: 0,
     )
     killpg_calls = []
 
@@ -79,15 +101,16 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
 
     def fake_killpg(pgid, sig):
         killpg_calls.append((pgid, sig))
-        if sig == 0:
-            raise ProcessLookupError
 
     monkeypatch.setattr(os, "getpgid", fake_getpgid)
     monkeypatch.setattr(os, "killpg", fake_killpg)
 
     env._kill_process(proc)
 
-    assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)]
+    # Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
+    # because the caller (timeout / KeyboardInterrupt / SystemExit branches)
+    # has already given up on the process.
+    assert killpg_calls == [(67890, signal.SIGKILL)]
 
 
 def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
diff --git a/tools/environments/local.py b/tools/environments/local.py
index 3200e63e..49e8ce6e 100644
--- a/tools/environments/local.py
+++ b/tools/environments/local.py
@@ -382,37 +382,19 @@ class LocalEnvironment(BaseEnvironment):
         return proc
 
     def _kill_process(self, proc):
-        """Kill the entire process group (all children)."""
-
-        def _group_alive(pgid: int) -> bool:
-            try:
-                # POSIX-only: _IS_WINDOWS is handled before this helper is used.
-                os.killpg(pgid, 0)
-                return True
-            except ProcessLookupError:
-                return False
-            except PermissionError:
-                # The group exists, even if this process cannot signal it.
-                return True
-
-        def _wait_for_group_exit(pgid: int, timeout: float) -> bool:
-            deadline = time.monotonic() + timeout
-            while time.monotonic() < deadline:
-                # Reap the wrapper promptly. A dead but unreaped group leader
-                # still makes killpg(pgid, 0) report the group as alive.
-                try:
-                    proc.poll()
-                except Exception:
-                    pass
-                if not _group_alive(pgid):
-                    return True
-                time.sleep(0.05)
-            try:
-                proc.poll()
-            except Exception:
-                pass
-            return not _group_alive(pgid)
+        """Kill the entire process group (all children).
 
+        This is the cleanup path — invoked from ``_wait_for_process`` for
+        the timeout, KeyboardInterrupt, and SystemExit branches.  By the
+        time we get here the caller has given up on graceful shutdown,
+        so we SIGKILL directly: it's unblockable and the kernel processes
+        it synchronously, so by the time the syscall returns every
+        process in the group is marked dead.  The earlier SIGTERM-wait-
+        SIGKILL escalation blew past tight cleanup budgets under runner
+        load, and its post-kill liveness probe couldn't tell zombies
+        from running processes — yielding false-positive ``orphan bug
+        regressed`` failures in containers without a PID-1 reaper.
+        """
         try:
             if _IS_WINDOWS:
                 proc.terminate()
@@ -425,24 +407,11 @@ class LocalEnvironment(BaseEnvironment):
                         raise
 
                 try:
-                    os.killpg(pgid, signal.SIGTERM)
-                except ProcessLookupError:
-                    return
-
-                # Wait on the process group, not just the shell wrapper. Under
-                # load the wrapper can exit before grandchildren do; returning
-                # at that point leaves orphaned process-group members behind.
-                if _wait_for_group_exit(pgid, 1.0):
-                    return
-
-                try:
-                    # POSIX-only: _IS_WINDOWS is handled by the outer branch.
                     os.killpg(pgid, signal.SIGKILL)
                 except ProcessLookupError:
                     return
-                _wait_for_group_exit(pgid, 2.0)
                 try:
-                    proc.wait(timeout=0.2)
+                    proc.wait(timeout=0.5)
                 except (subprocess.TimeoutExpired, OSError):
                     pass
         except (ProcessLookupError, PermissionError, OSError):