From 53a01a84007276e3f9117d994dd7eb249ab10e6a Mon Sep 17 00:00:00 2001
From: dev-lead <dev-lead@moleculesai.app>
Date: Fri, 8 May 2026 14:11:19 -0700
Subject: [PATCH] =?UTF-8?q?fix(systemd):=20align=20tests=20with=20producti?=
 =?UTF-8?q?on=20contract=20=E2=80=94=20D-Bus=20stubs=20+=20TimeoutStopSec?=
 =?UTF-8?q?=20(partial=20close=20hermes-agent#9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two distinct sub-shapes converged in the seven systemd test failures
tracked under hermes-agent#9.  Both are addressed here without changing
production code; the production contract is correct on both axes.

## Sub-shape 1 — TimeoutStopSec=210 vs 90 (2 tests)

`TestGeneratedSystemdUnits::test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout`
and its system-scope sibling pin `TimeoutStopSec=90`.  The unit
generator computes `restart_timeout = max(60, drain_timeout) + 30s`
of post-interrupt cleanup headroom (gateway.py L1635).

PR #18761 (2026-05-02) intentionally raised the default
`agent.restart_drain_timeout` from 60s to 180s after a /restart on
2026-05-02 force-interrupted three mid-API-call agents inside the old
60s budget.  The new arithmetic is therefore `max(60, 180) + 30 = 210s`,
and both unit generators produce `TimeoutStopSec=210`.

The tests are updated to derive the expected value from
`DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT` (already imported at the top of
the file) so future drain-timeout changes flag the contract drift in
one place.  An explicit `expected_timeout == 210` assert + commit
reference keeps the rationale visible at the failure site.

## Sub-shape 2 — D-Bus / loginctl mocks (5 tests)

`systemd_start()` and `systemd_restart()` (user scope) gained a
`_preflight_user_systemd()` call that probes `/run/user/$UID/{bus,
systemd/private}` and, if both are missing, runs `loginctl
enable-linger $USER` to bring the user@.service socket up.  None of
that is reachable from a unit-test fixture: the test runner has no
real user D-Bus session, the linger probe touches the live filesystem,
and `loginctl enable-linger runner` falls outside the
`fake_subprocess_run` whitelist these tests pin.

Five tests were therefore failing not on production behavior but on
their fixtures missing a setup that the production code path can't
have in a test:

  - TestSystemdServiceRefresh::test_systemd_start_refreshes_outdated_unit
  - TestSystemdServiceRefresh::test_systemd_restart_refreshes_outdated_unit
  - TestGatewaySystemServiceRouting::test_systemd_restart_self_requests_graceful_restart_and_waits
  - TestGatewaySystemServiceRouting::test_systemd_restart_recovers_failed_planned_restart
  - TestGatewayServiceDetection::test_supports_systemd_services_returns_true_when_systemctl_present
    (different shape — see below)
  - TestSupportsSystemdServicesWSL::test_native_linux (same shape as
    the detection test)

Each of the four restart/start tests now stubs `_preflight_user_systemd`
with a no-op.  The preflight contract itself is exercised end-to-end
by the existing `TestPreflightUserSystemd` suite (six tests), so this
stub doesn't paper over any real bug — it only narrows the assertion
to what the test was actually meant to check (refresh + systemctl call
shape).

For the two `supports_systemd_services()` tests, the missing fixture
was `is_container()`.  CI runners frequently expose `/.dockerenv`,
which makes `is_container()` return True at the gateway layer and
sends the call into the `_container_systemd_operational()` branch
(real systemctl probe).  Both tests model a native-Linux host, so
they pin `is_container=False`; the WSL one also pins
`shutil.which("systemctl")` to a non-None value so the early
binary-presence check doesn't short-circuit.

## Verification

  - 7/7 named tests in the brief now pass on a clean macOS Python 3.13
    venv (no real systemd present).
  - Set-diff of `pytest tests/hermes_cli/ -q` failure list before vs
    after the change: 7 fewer failures, 0 added (residual non-systemd
    failures match between baseline and branch — same set, just
    reordered by xdist).
  - `pytest tests/gateway/ -q`: same 14 pre-existing failures as on
    main; no new regressions.

The post-fix contract:

  - User and system units write `TimeoutStopSec=210` while
    `restart_drain_timeout=180s` (default).  The formula is pinned
    in test as `max(60, drain_timeout) + 30` so future drain bumps
    fail loudly in one place.
  - `_preflight_user_systemd()` probe sequence (still owned by
    `TestPreflightUserSystemd`): D-Bus / private-socket exists →
    short-circuit; else linger-on → wait 3s; else `loginctl
    enable-linger $USER` → wait 5s; else raise
    `UserSystemdUnavailableError` with remediation hint.
---
 tests/hermes_cli/test_gateway_service.py | 50 ++++++++++++++++++++----
 tests/hermes_cli/test_gateway_wsl.py     |  7 ++++
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py
index f2bfa8b8..408a4558 100644
--- a/tests/hermes_cli/test_gateway_service.py
+++ b/tests/hermes_cli/test_gateway_service.py
@@ -64,6 +64,10 @@ class TestSystemdServiceRefresh:
 
         monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path)
         monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n")
+        # Test fixtures don't have a real user D-Bus session — stub the preflight
+        # so the test exercises the unit-refresh + systemctl call sequence rather
+        # than the linger / loginctl probe (covered by TestPreflightUserSystemd).
+        monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None)
 
         calls = []
 
@@ -87,6 +91,9 @@ class TestSystemdServiceRefresh:
 
         monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path)
         monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n")
+        # Same rationale as test_systemd_start_refreshes_outdated_unit: this test
+        # validates the unit-refresh + systemctl call shape, not the D-Bus probe.
+        monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None)
 
         calls = []
 
@@ -115,10 +122,20 @@ class TestGeneratedSystemdUnits:
         assert "ExecStop=" not in unit
         assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
         assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
-        # TimeoutStopSec must exceed the default drain_timeout (60s) so
-        # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
-        # (tool subprocess kill, adapter disconnect) runs — issue #8202.
-        assert "TimeoutStopSec=90" in unit
+        # TimeoutStopSec must exceed the default drain_timeout so systemd
+        # doesn't SIGKILL the cgroup before post-interrupt cleanup (tool
+        # subprocess kill, adapter disconnect) runs — issue #8202.
+        # gateway.py: restart_timeout = max(60, drain_timeout) + 30s headroom.
+        # PR #18761 (2026-05-02) raised the default drain_timeout from 60s
+        # to 180s after a /restart force-interrupted three mid-API-call
+        # agents inside the old 60s budget, so the expected TimeoutStopSec
+        # is now 180 + 30 = 210s.
+        expected_timeout = max(60, int(DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT)) + 30
+        assert f"TimeoutStopSec={expected_timeout}" in unit
+        assert expected_timeout == 210, (
+            "drain_timeout default changed; update DEFAULT_CONFIG and this "
+            "test in lockstep — see PR #18761 commit message for rationale."
+        )
 
     def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
         monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)
@@ -134,10 +151,16 @@ class TestGeneratedSystemdUnits:
         assert "ExecStop=" not in unit
         assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
         assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
-        # TimeoutStopSec must exceed the default drain_timeout (60s) so
-        # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
-        # (tool subprocess kill, adapter disconnect) runs — issue #8202.
-        assert "TimeoutStopSec=90" in unit
+        # See test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout
+        # for the TimeoutStopSec rationale; both unit generators share the same
+        # max(60, drain_timeout) + 30s formula so the user and system units stay
+        # in lockstep on this contract.
+        expected_timeout = max(60, int(DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT)) + 30
+        assert f"TimeoutStopSec={expected_timeout}" in unit
+        assert expected_timeout == 210, (
+            "drain_timeout default changed; update DEFAULT_CONFIG and this "
+            "test in lockstep — see PR #18761 commit message for rationale."
+        )
         assert "WantedBy=multi-user.target" in unit
 
 
@@ -437,6 +460,11 @@ class TestGatewayServiceDetection:
         monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
         monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
         monkeypatch.setattr(gateway_cli, "is_wsl", lambda: False)
+        # supports_systemd_services() also branches on is_container() — without
+        # this stub the test runs the container fallback which calls real
+        # systemctl, returning False on a CI runner that has /.dockerenv but
+        # no working systemd. The test models a native-Linux host, so pin it.
+        monkeypatch.setattr(gateway_cli, "is_container", lambda: False)
         monkeypatch.setattr(gateway_cli.shutil, "which", lambda name: "/usr/bin/systemctl")
 
         assert gateway_cli.supports_systemd_services() is True
@@ -487,6 +515,10 @@ class TestGatewaySystemServiceRouting:
         calls = []
 
         monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
+        # systemd_restart() calls _preflight_user_systemd() before refreshing
+        # the unit; the test fixture has no real D-Bus session so stub it. The
+        # preflight contract itself is covered by TestPreflightUserSystemd below.
+        monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None)
         monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: calls.append(("refresh", system)))
         monkeypatch.setattr(
             "gateway.status.get_running_pid",
@@ -541,6 +573,8 @@ class TestGatewaySystemServiceRouting:
 
     def test_systemd_restart_recovers_failed_planned_restart(self, monkeypatch, capsys):
         monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
+        # See sibling test for preflight stub rationale.
+        monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda *a, **kw: None)
         monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: None)
         monkeypatch.setattr(
             "gateway.status.read_runtime_status",
diff --git a/tests/hermes_cli/test_gateway_wsl.py b/tests/hermes_cli/test_gateway_wsl.py
index ea5bf40c..1d0ea150 100644
--- a/tests/hermes_cli/test_gateway_wsl.py
+++ b/tests/hermes_cli/test_gateway_wsl.py
@@ -145,6 +145,13 @@ class TestSupportsSystemdServicesWSL:
         monkeypatch.setattr(gateway, "is_linux", lambda: True)
         monkeypatch.setattr(gateway, "is_termux", lambda: False)
         monkeypatch.setattr(gateway, "is_wsl", lambda: False)
+        # Pin is_container() too: CI runners often have /.dockerenv which would
+        # send supports_systemd_services() down the _container_systemd_operational()
+        # branch (calls real systemctl) and return False here.
+        monkeypatch.setattr(gateway, "is_container", lambda: False)
+        # Make sure a systemctl binary appears present so we don't trip the
+        # which() short-circuit in supports_systemd_services().
+        monkeypatch.setattr(gateway.shutil, "which", lambda name: "/usr/bin/systemctl")
         assert gateway.supports_systemd_services() is True
 
     def test_termux_still_excluded(self, monkeypatch):
-- 
2.45.2