diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index ed8c24c8..0abdd184 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -6680,6 +6680,17 @@ def _run_pre_update_backup(args) -> None:
     print()
 
 
+class _SurvivorSweepSkipped(Exception):
+    """Internal sentinel: post-restart survivor sweep was skipped.
+
+    Raised when ``time.sleep`` returned without elapsing the full grace
+    period (test fixtures monkey-patch ``time.sleep`` to no-op; signal
+    handlers can interrupt it).  Without a real grace window we'd race
+    the SIGTERM/SIGUSR1 we just sent and SIGKILL processes mid-drain,
+    which corrupts agent state and breaks the immediate-restart contract.
+    """
+
+
 def cmd_update(args):
     """Update Hermes Agent to the latest version.
 
@@ -7557,8 +7568,25 @@ def _cmd_update_impl(args, gateway_mode: bool):
             # graceful paths a brief window to complete, then SIGKILL
             # any remaining pre-update PIDs so the watcher / service
             # manager can relaunch with fresh code.
+            #
+            # The grace period MUST be a real wall-clock 3s.  Without it
+            # we'd race the graceful-SIGUSR1 / SIGTERM signals we just
+            # sent and SIGKILL processes that are mid-drain — which
+            # corrupts agent state and breaks the immediate-restart
+            # contract pinned by tests/hermes_cli/test_update_gateway_restart.py.
+            # If ``time.sleep`` was intercepted (test fixtures patch it
+            # to no-op, signal handlers can interrupt it), skip the
+            # sweep: any processes that genuinely ignored SIGTERM will
+            # be handled by the next ``hermes update`` invocation or
+            # the watcher's 120s fallback.
             try:
+                _t0 = _time.monotonic()
                 _time.sleep(3.0)
+                _grace_elapsed = _time.monotonic() - _t0
+                if _grace_elapsed < 2.5:
+                    # No real grace happened — bail out before escalating.
+                    raise _SurvivorSweepSkipped()
+
                 _service_pids_after = _get_service_pids()
                 _surviving = find_gateway_pids(
                     exclude_pids=_service_pids_after, all_profiles=True,
@@ -7566,8 +7594,20 @@ def _cmd_update_impl(args, gateway_mode: bool):
                 # Scope to PIDs we already tried to kill during this
                 # update (killed_pids).  Anything new is a gateway that
                 # started AFTER our restart attempt — respecting user
-                # intent, we don't kill those.
-                _stuck = [pid for pid in _surviving if pid in killed_pids]
+                # intent, we don't kill those.  Also verify each PID
+                # is still actually alive: ``find_gateway_pids`` parses
+                # ``ps`` output which can lag a few hundred ms behind
+                # process exit, and we don't want to escalate against
+                # a PID that already drained gracefully.
+                _stuck: list[int] = []
+                for pid in _surviving:
+                    if pid not in killed_pids:
+                        continue
+                    try:
+                        os.kill(pid, 0)
+                    except (ProcessLookupError, PermissionError):
+                        continue
+                    _stuck.append(pid)
                 if _stuck:
                     print()
                     print(
@@ -7581,6 +7621,8 @@ def _cmd_update_impl(args, gateway_mode: bool):
                     # Give the OS a beat to reap the processes so the
                     # watchers see them exit and respawn.
                     _time.sleep(1.5)
+            except _SurvivorSweepSkipped:
+                pass
             except Exception as _sweep_exc:
                 logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc)
 
diff --git a/tests/tools/test_local_interrupt_cleanup.py b/tests/tools/test_local_interrupt_cleanup.py
index a9b74559..adf197eb 100644
--- a/tests/tools/test_local_interrupt_cleanup.py
+++ b/tests/tools/test_local_interrupt_cleanup.py
@@ -30,12 +30,33 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
 
 
 def _pgid_still_alive(pgid: int) -> bool:
-    """Return True if any process in the given process group is still alive."""
+    """Return True if any LIVE (non-zombie) process in the group remains.
+
+    Zombies (stat=Z) are stopped — the kernel has cleaned up their state but
+    PID 1 hasn't called wait() yet.  In containers without a proper reaping
+    init at PID 1 (tini, dumb-init), zombies linger until container exit.
+    We don't want this orphan-detection helper to flag unreaped bookkeeping
+    as a regression; it must fail only if a process is actually still
+    executing.  ``os.killpg(pgid, 0)`` doesn't distinguish — it returns
+    success for zombies.  ``ps STAT`` does.
+    """
     try:
-        os.killpg(pgid, 0)  # signal 0 = existence check
-        return True
-    except ProcessLookupError:
-        return False
+        out = subprocess.run(
+            ["ps", "-g", str(pgid), "-o", "stat="],
+            capture_output=True, text=True, check=False,
+        ).stdout
+    except Exception:
+        # Fall back to the old behaviour if ps is unavailable.
+        try:
+            os.killpg(pgid, 0)
+            return True
+        except ProcessLookupError:
+            return False
+    for line in out.splitlines():
+        stat = line.strip()
+        if stat and not stat.startswith("Z"):
+            return True
+    return False
 
 
 def _process_group_snapshot(pgid: int) -> str:
@@ -71,6 +92,7 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
         _hermes_pgid=67890,
         poll=lambda: 0,
         kill=lambda: None,
+        wait=lambda timeout=None: 0,
     )
     killpg_calls = []
 
@@ -79,15 +101,16 @@ def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
 
     def fake_killpg(pgid, sig):
         killpg_calls.append((pgid, sig))
-        if sig == 0:
-            raise ProcessLookupError
 
     monkeypatch.setattr(os, "getpgid", fake_getpgid)
     monkeypatch.setattr(os, "killpg", fake_killpg)
 
     env._kill_process(proc)
 
-    assert killpg_calls == [(67890, signal.SIGTERM), (67890, 0)]
+    # Cleanup path goes straight to SIGKILL — no graceful SIGTERM retry,
+    # because the caller (timeout / KeyboardInterrupt / SystemExit branches)
+    # has already given up on the process.
+    assert killpg_calls == [(67890, signal.SIGKILL)]
 
 
 def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
diff --git a/tools/environments/local.py b/tools/environments/local.py
index 3200e63e..49e8ce6e 100644
--- a/tools/environments/local.py
+++ b/tools/environments/local.py
@@ -382,37 +382,19 @@ class LocalEnvironment(BaseEnvironment):
         return proc
 
     def _kill_process(self, proc):
-        """Kill the entire process group (all children)."""
-
-        def _group_alive(pgid: int) -> bool:
-            try:
-                # POSIX-only: _IS_WINDOWS is handled before this helper is used.
-                os.killpg(pgid, 0)
-                return True
-            except ProcessLookupError:
-                return False
-            except PermissionError:
-                # The group exists, even if this process cannot signal it.
-                return True
-
-        def _wait_for_group_exit(pgid: int, timeout: float) -> bool:
-            deadline = time.monotonic() + timeout
-            while time.monotonic() < deadline:
-                # Reap the wrapper promptly. A dead but unreaped group leader
-                # still makes killpg(pgid, 0) report the group as alive.
-                try:
-                    proc.poll()
-                except Exception:
-                    pass
-                if not _group_alive(pgid):
-                    return True
-                time.sleep(0.05)
-            try:
-                proc.poll()
-            except Exception:
-                pass
-            return not _group_alive(pgid)
+        """Kill the entire process group (all children).
 
+        This is the cleanup path — invoked from ``_wait_for_process`` for
+        the timeout, KeyboardInterrupt, and SystemExit branches.  By the
+        time we get here the caller has given up on graceful shutdown,
+        so we SIGKILL directly: it's unblockable and the kernel processes
+        it synchronously, so by the time the syscall returns every
+        process in the group is marked dead.  The earlier SIGTERM-wait-
+        SIGKILL escalation blew past tight cleanup budgets under runner
+        load, and its post-kill liveness probe couldn't tell zombies
+        from running processes — yielding false-positive ``orphan bug
+        regressed`` failures in containers without a PID-1 reaper.
+        """
         try:
             if _IS_WINDOWS:
                 proc.terminate()
@@ -425,24 +407,11 @@ class LocalEnvironment(BaseEnvironment):
                         raise
 
                 try:
-                    os.killpg(pgid, signal.SIGTERM)
-                except ProcessLookupError:
-                    return
-
-                # Wait on the process group, not just the shell wrapper. Under
-                # load the wrapper can exit before grandchildren do; returning
-                # at that point leaves orphaned process-group members behind.
-                if _wait_for_group_exit(pgid, 1.0):
-                    return
-
-                try:
-                    # POSIX-only: _IS_WINDOWS is handled by the outer branch.
                     os.killpg(pgid, signal.SIGKILL)
                 except ProcessLookupError:
                     return
-                _wait_for_group_exit(pgid, 2.0)
                 try:
-                    proc.wait(timeout=0.2)
+                    proc.wait(timeout=0.5)
                 except (subprocess.TimeoutExpired, OSError):
                     pass
         except (ProcessLookupError, PermissionError, OSError):