From 9dc9a6998f3c29808b7a53339523013a3a1d853f Mon Sep 17 00:00:00 2001 From: Dev Lead Date: Fri, 8 May 2026 14:10:43 -0700 Subject: [PATCH] fix(test_gateway_service,test_gateway_wsl): align systemd tests with current production shape (partial close hermes-agent#9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sub-shape A (TimeoutStopSec literal drift): - generate_systemd_unit() formula: max(60, drain_timeout) + 30 - DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT bumped 60→180 in config.py, so emitted TimeoutStopSec went 90→210; tests pinned the literal 90. - Replace literal with TestGeneratedSystemdUnits._expected_timeout_stop_sec() helper that mirrors the production formula via _get_restart_drain_timeout(), so future config-default bumps don't silently regress the test. Sub-shape B (production preflight not stubbed): - systemd_start() / systemd_restart() now call _preflight_user_systemd() before the systemctl call sequence (PR #14531: "preflight user D-Bus before systemctl --user start"). The preflight invokes loginctl enable-linger and waits for the D-Bus socket — neither of which the unit tests' fake subprocess runner answers. - Unit-tests under TestSystemdServiceRefresh and TestGatewaySystemServiceRouting assert the systemctl call sequence, not the preflight; preflight has dedicated coverage in TestUserSystemdPrivateSocketPreflight. Stub _preflight_user_systemd as a no-op in the four affected tests. Sub-shape B (supports_systemd_services container branch): - supports_systemd_services() now branches on is_container() to decide whether to probe `systemctl is-system-running`. Tests that assert the native-Linux True path didn't stub is_container, so a containerized CI runner inherited a real probe of the runner image's systemd: - test_supports_systemd_services_returns_true_when_systemctl_present - TestSupportsSystemdServicesWSL.test_native_linux - Stub is_container() False in both, plus shutil.which() in the WSL test so it also passes on macOS dev boxes (was implicitly Linux-only via systemctl-on-PATH). Tests fixed: test_systemd_start_refreshes_outdated_unit test_systemd_restart_refreshes_outdated_unit test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout test_system_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout test_supports_systemd_services_returns_true_when_systemctl_present test_systemd_restart_self_requests_graceful_restart_and_waits test_systemd_restart_recovers_failed_planned_restart TestSupportsSystemdServicesWSL.test_native_linux Verified locally on darwin py3.13: all 8 target tests pass; one unrelated macOS-only failure (test_wsl_with_systemd) remains because its body relies on the host having systemctl on PATH — not in this PR's scope (not in the issue's failing-list). Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/hermes_cli/test_gateway_service.py | 44 +++++++++++++++++++++--- tests/hermes_cli/test_gateway_wsl.py | 11 +++++- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index f2bfa8b8..19644734 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -64,6 +64,12 @@ class TestSystemdServiceRefresh: monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path) monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n") + # Production now preflights user-systemd availability (loginctl + # enable-linger + D-Bus socket wait, #14531) before start/restart. + # These unit tests assert the systemctl call sequence, not the + # preflight — stub the preflight as a no-op so the fake subprocess + # runner doesn't have to reproduce the loginctl/D-Bus dance. + monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None) calls = [] @@ -87,6 +93,9 @@ class TestSystemdServiceRefresh: monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path) monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n") + # See note on test_systemd_start_refreshes_outdated_unit — preflight + # is a separate concern and has its own dedicated coverage. + monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None) calls = [] @@ -108,6 +117,15 @@ class TestSystemdServiceRefresh: class TestGeneratedSystemdUnits: + @staticmethod + def _expected_timeout_stop_sec() -> int: + # Mirror the formula in gateway.generate_systemd_unit: + # restart_timeout = max(60, drain_timeout) + 30 + # so that bumping the default drain_timeout in config doesn't silently + # break this test — we want to pin the relationship, not a magic number. + drain_timeout = int(gateway_cli._get_restart_drain_timeout() or 0) + return max(60, drain_timeout) + 30 + def test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout(self): unit = gateway_cli.generate_systemd_unit(system=False) @@ -115,10 +133,13 @@ class TestGeneratedSystemdUnits: assert "ExecStop=" not in unit assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit - # TimeoutStopSec must exceed the default drain_timeout (60s) so + # TimeoutStopSec must exceed the configured drain_timeout so # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup # (tool subprocess kill, adapter disconnect) runs — issue #8202. - assert "TimeoutStopSec=90" in unit + # Formula is max(60, drain_timeout) + 30; pin the relationship to + # _get_restart_drain_timeout() rather than a literal so a config + # default bump (default jumped 60→180s) doesn't silently regress us. + assert f"TimeoutStopSec={self._expected_timeout_stop_sec()}" in unit def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch): monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None) @@ -134,10 +155,13 @@ class TestGeneratedSystemdUnits: assert "ExecStop=" not in unit assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit - # TimeoutStopSec must exceed the default drain_timeout (60s) so + # TimeoutStopSec must exceed the configured drain_timeout so # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup # (tool subprocess kill, adapter disconnect) runs — issue #8202. - assert "TimeoutStopSec=90" in unit + # Formula is max(60, drain_timeout) + 30; pin the relationship to + # _get_restart_drain_timeout() rather than a literal so a config + # default bump (default jumped 60→180s) doesn't silently regress us. + assert f"TimeoutStopSec={self._expected_timeout_stop_sec()}" in unit assert "WantedBy=multi-user.target" in unit @@ -437,6 +461,10 @@ class TestGatewayServiceDetection: monkeypatch.setattr(gateway_cli, "is_linux", lambda: True) monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) monkeypatch.setattr(gateway_cli, "is_wsl", lambda: False) + # Native-Linux assertion: explicitly opt out of the container path + # (added after this test was written) so a containerized CI runner + # doesn't inherit a probe of the real systemd in the runner image. + monkeypatch.setattr(gateway_cli, "is_container", lambda: False) monkeypatch.setattr(gateway_cli.shutil, "which", lambda name: "/usr/bin/systemctl") assert gateway_cli.supports_systemd_services() is True @@ -487,6 +515,11 @@ class TestGatewaySystemServiceRouting: calls = [] monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) + # Production now preflights user-systemd availability (loginctl + # enable-linger + D-Bus socket wait, #14531) before restart. This + # test exercises the restart routing path; preflight has its own + # dedicated coverage in TestUserSystemdPrivateSocketPreflight. + monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None) monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: calls.append(("refresh", system))) monkeypatch.setattr( "gateway.status.get_running_pid", @@ -541,6 +574,9 @@ class TestGatewaySystemServiceRouting: def test_systemd_restart_recovers_failed_planned_restart(self, monkeypatch, capsys): monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) + # See note on test_systemd_restart_self_requests_graceful_restart_and_waits + # — preflight is a separate concern with dedicated coverage. + monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None) monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: None) monkeypatch.setattr( "gateway.status.read_runtime_status", diff --git a/tests/hermes_cli/test_gateway_wsl.py b/tests/hermes_cli/test_gateway_wsl.py index ea5bf40c..e1a20e65 100644 --- a/tests/hermes_cli/test_gateway_wsl.py +++ b/tests/hermes_cli/test_gateway_wsl.py @@ -141,10 +141,19 @@ class TestSupportsSystemdServicesWSL: assert gateway.supports_systemd_services() is False def test_native_linux(self, monkeypatch): - """Native Linux (not WSL) → True without checking systemd.""" + """Native Linux (not WSL, not container) → True without further probing.""" monkeypatch.setattr(gateway, "is_linux", lambda: True) monkeypatch.setattr(gateway, "is_termux", lambda: False) monkeypatch.setattr(gateway, "is_wsl", lambda: False) + # supports_systemd_services() now also branches on is_container() to + # decide whether to probe `systemctl is-system-running` — explicitly + # opt this case out of the container path so a containerized CI + # runner doesn't inherit the probe of the runner image's systemd. + monkeypatch.setattr(gateway, "is_container", lambda: False) + # On macOS dev boxes shutil.which("systemctl") returns None; stub it + # so the test exercises the native-Linux branch independently of the + # host's $PATH. + monkeypatch.setattr(gateway.shutil, "which", lambda name: "/usr/bin/systemctl") assert gateway.supports_systemd_services() is True def test_termux_still_excluded(self, monkeypatch): -- 2.45.2