fix(update): poll is-active instead of one-shot sleep(3) after gateway restart (#15639)

The auto-restart path in `hermes update` verifies systemd unit health with `time.sleep(3)` + a single `systemctl is-active` call. The unit's Stopped -> Started transition after a graceful SIGUSR1 exit (or a hard restart) is not always complete inside that 3s window, so the verify races and reports 'drained but didn't relaunch' even though systemd is about to bring the unit back up a fraction of a second later. Users then see a spurious warning, a redundant fallback `systemctl restart` fires, and adapters (Discord, WhatsApp) get restarted twice. Replace the three sleep+oneshot sites with a small `_wait_for_service_active()` closure that polls `is-active` every 0.5s for up to 10s. The outcome is unchanged when the unit is healthy or truly dead (a healthy unit is now confirmed on the first poll, and a dead one is reported after the full ~10s budget instead of 3s) — only the race window around a clean restart is now handled correctly. Tests: tests/hermes_cli/test_update_gateway_restart.py (41/41).
2026-04-25 06:11:22 -07:00 · 2026-04-25 06:11:22 -07:00 · 6e561ffa6d
commit 6e561ffa6d
parent ac05daa189
1 changed files with 39 additions and 24 deletions
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -6046,6 +6046,31 @@ def _cmd_update_impl(args, gateway_mode: bool):
            )
            import signal as _signal

+            def _wait_for_service_active(
+                scope_cmd_: list, svc_name_: str, timeout: float = 10.0,
+            ) -> bool:
+                """Poll ``systemctl is-active`` until the unit reports active.
+
+                Probes immediately, then every 0.5s, because a one-shot check
+                races systemd's Stopped -> Started transition after a graceful
+                exit or hard restart.  Returns True once stdout is ``active``;
+                False once ``timeout`` seconds (min 0.5) pass without that.
+                """
+                deadline = _time.monotonic() + max(timeout, 0.5)  # budget is never below 0.5s
+                while True:
+                    try:
+                        _verify = subprocess.run(
+                            scope_cmd_ + ["is-active", svc_name_],
+                            capture_output=True, text=True, timeout=5,
+                        )
+                        if _verify.stdout.strip() == "active":
+                            return True
+                    except (FileNotFoundError, subprocess.TimeoutExpired):
+                        pass  # executable missing or probe timed out: count as "not active yet"
+                    if _time.monotonic() >= deadline:  # checked after the probe, so one probe always runs
+                        return False
+                    _time.sleep(0.5)
+
            # Drain budget for graceful SIGUSR1 restarts.  The gateway drains
            # for up to ``agent.restart_drain_timeout`` (default 60s) before
            # exiting with code 75; we wait slightly longer so the drain
@ -6152,14 +6177,14 @@ def _cmd_update_impl(args, gateway_mode: bool):

                            if _graceful_ok:
                                # Gateway exited 75; systemd should relaunch
-                                # via Restart=on-failure.  Verify the new
-                                # process came up.
-                                _time.sleep(3)
-                                verify = subprocess.run(
-                                    scope_cmd + ["is-active", svc_name],
-                                    capture_output=True, text=True, timeout=5,
-                                )
-                                if verify.stdout.strip() == "active":
+                                # via Restart=on-failure.  Poll is-active for
+                                # up to ~10s because the unit's Stopped ->
+                                # Started transition can take a few seconds
+                                # after the old PID exits, and a one-shot
+                                # check races that window.
+                                if _wait_for_service_active(
+                                    scope_cmd, svc_name, timeout=10.0,
+                                ):
                                    restarted_services.append(svc_name)
                                    continue
                                # Process exited but wasn't respawned (older
@ -6185,14 +6210,9 @@ def _cmd_update_impl(args, gateway_mode: bool):
                                # Verify the service actually survived the
                                # restart.  systemctl restart returns 0 even
                                # if the new process crashes immediately.
-                                _time.sleep(3)
-                                verify = subprocess.run(
-                                    scope_cmd + ["is-active", svc_name],
-                                    capture_output=True,
-                                    text=True,
-                                    timeout=5,
-                                )
-                                if verify.stdout.strip() == "active":
+                                if _wait_for_service_active(
+                                    scope_cmd, svc_name, timeout=10.0,
+                                ):
                                    restarted_services.append(svc_name)
                                else:
                                    # Retry once — transient startup failures
@ -6207,14 +6227,9 @@ def _cmd_update_impl(args, gateway_mode: bool):
                                        text=True,
                                        timeout=15,
                                    )
-                                    _time.sleep(3)
-                                    verify2 = subprocess.run(
-                                        scope_cmd + ["is-active", svc_name],
-                                        capture_output=True,
-                                        text=True,
-                                        timeout=5,
-                                    )
-                                    if verify2.stdout.strip() == "active":
+                                    if _wait_for_service_active(
+                                        scope_cmd, svc_name, timeout=10.0,
+                                    ):
                                        restarted_services.append(svc_name)
                                        print(f"  ✓ {svc_name} recovered on retry")
                                    else: