Merge pull request #2427 from Molecule-AI/staging

staging → main: auto-promote 2a56697
This commit is contained in:
github-actions[bot] 2026-04-30 19:08:15 -07:00 committed by GitHub
commit ace3c85708
6 changed files with 562 additions and 11 deletions

View File

@ -301,15 +301,34 @@ export function ConfigTab({ workspaceId }: Props) {
// partial-save state — we report it as a user-visible warning
// rather than lying "Saved" and letting the user discover the
// revert on next reload.
const oldModel = (oldParsed.model as string) || "";
//
// Read from runtime_config.model first, then fall back to top-level
// model. The dropdown's onChange (above, ~line 475) writes to
// runtime_config.model whenever a runtime is selected (hermes,
// claude-code, etc.) and only falls back to top-level model when
// there's no runtime. handleSave used to diff against top-level
// model only, so for any runtime-bearing workspace the user's
// model selection never persisted — they'd Save & Restart, the
// EC2 would boot with HERMES_DEFAULT_MODEL empty, and hermes
// would fall back to nousresearch/hermes-4-70b → "No LLM provider
// configured" error in the chat. Caught 2026-04-30 on hongmingwang
// hermes workspace 32993ee7-…cb9d75d112a5.
const nextModelRaw = (nextSource.runtime_config as Record<string, unknown> | undefined)?.model;
const oldModelRaw = (oldParsed.runtime_config as Record<string, unknown> | undefined)?.model;
const nextModel =
typeof nextModelRaw === "string" && nextModelRaw
? nextModelRaw
: typeof nextSource.model === "string"
? nextSource.model
: "";
const oldModel =
typeof oldModelRaw === "string" && oldModelRaw
? oldModelRaw
: (oldParsed.model as string) || "";
let modelSaveError: string | null = null;
if (
typeof nextSource.model === "string" &&
nextSource.model &&
nextSource.model !== oldModel
) {
if (nextModel && nextModel !== oldModel) {
try {
await api.put(`/workspaces/${workspaceId}/model`, { model: nextSource.model });
await api.put(`/workspaces/${workspaceId}/model`, { model: nextModel });
} catch (e) {
modelSaveError = e instanceof Error ? e.message : "Model update was rejected";
}

View File

@ -11,7 +11,7 @@
// Each test pins one invariant. If any fails, the bug is back.
import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
import { render, screen, cleanup, waitFor } from "@testing-library/react";
import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
import React from "react";
afterEach(cleanup);
@ -168,6 +168,116 @@ describe("ConfigTab — hermes workspace", () => {
});
});
describe("ConfigTab — Save persists model under runtime_config.model (2026-04-30)", () => {
// The dropdown's onChange writes to config.runtime_config.model whenever
// a runtime is selected (hermes, claude-code, etc.) and only falls back
// to top-level config.model when no runtime is set. The Save handler used
// to diff against top-level model only, so for any runtime-bearing
// workspace the user's model selection never persisted — Save & Restart
// would reboot with HERMES_DEFAULT_MODEL empty, hermes would fall back
// to nousresearch/hermes-4-70b → "No LLM provider configured" in chat.
// Caught 2026-04-30 on hongmingwang hermes workspace.
it("PUTs /model when user picks a model on a hermes workspace", async () => {
apiGet.mockImplementation((path: string) => {
if (path === "/workspaces/ws-test") {
return Promise.resolve({ runtime: "hermes" });
}
if (path === "/workspaces/ws-test/model") {
return Promise.resolve({ model: "" });
}
if (path === "/workspaces/ws-test/files/config.yaml") {
return Promise.reject(new Error("not found"));
}
if (path === "/templates") {
return Promise.resolve([
{
id: "t-hermes",
name: "Hermes",
runtime: "hermes",
models: [
{ id: "minimax/MiniMax-M2.7-highspeed", name: "MiniMax M2.7" },
],
},
]);
}
return Promise.reject(new Error(`unmocked api.get: ${path}`));
});
apiPut.mockResolvedValue({});
apiPatch.mockResolvedValue({});
render(<ConfigTab workspaceId="ws-test" />);
// Wait for the runtime dropdown to populate so the model textbox renders.
await waitFor(() =>
expect(
(screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
).toBe("hermes"),
);
// The model input is a free-text input wired to a datalist of suggestions.
const modelInput = (await waitFor(() =>
screen.getByPlaceholderText(/anthropic:claude-sonnet/i),
)) as HTMLInputElement;
fireEvent.change(modelInput, {
target: { value: "minimax/MiniMax-M2.7-highspeed" },
});
// Click Save & Restart.
fireEvent.click(screen.getByRole("button", { name: /save & restart/i }));
await waitFor(() => {
expect(apiPut).toHaveBeenCalledWith("/workspaces/ws-test/model", {
model: "minimax/MiniMax-M2.7-highspeed",
});
});
});
it("does NOT PUT /model when the value is unchanged (no-op restart)", async () => {
apiGet.mockImplementation((path: string) => {
if (path === "/workspaces/ws-test") {
return Promise.resolve({ runtime: "hermes" });
}
if (path === "/workspaces/ws-test/model") {
return Promise.resolve({ model: "minimax/MiniMax-M2.7" });
}
if (path === "/workspaces/ws-test/files/config.yaml") {
return Promise.reject(new Error("not found"));
}
if (path === "/templates") {
return Promise.resolve([
{ id: "t-hermes", runtime: "hermes", models: [] },
]);
}
return Promise.reject(new Error(`unmocked api.get: ${path}`));
});
apiPut.mockResolvedValue({});
render(<ConfigTab workspaceId="ws-test" />);
// Wait for load.
await waitFor(() =>
expect(
(screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
).toBe("hermes"),
);
// We considered forcing isDirty by toggling a field that doesn't affect
// the model (Save is disabled until isDirty=true), to prove that even
// when Save fires, /model isn't called for an unchanged model. Simpler:
// without any user edit, Save & Restart stays disabled, so /model is
// trivially not PUT. The assert below verifies no /model PUT happens
// at any point during load.
const modelPuts = apiPut.mock.calls.filter(
([path]) => path === "/workspaces/ws-test/model",
);
expect(modelPuts).toHaveLength(0);
});
});
describe("ConfigTab — config.yaml on disk", () => {
it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
// Priority inversion in #2061: previously config.yaml overrode DB, so

View File

@ -91,6 +91,46 @@ def _runtime_metadata_payload() -> dict:
logger = logging.getLogger(__name__)
def _persist_inbound_secret_from_heartbeat(resp) -> None:
"""Persist ``platform_inbound_secret`` from a heartbeat response, if any.
The platform's heartbeat handler (workspace-server PR #2421) returns
the secret on every beat, mirroring /registry/register, so a workspace
whose secret was lazy-healed on the platform side picks it up within
one heartbeat tick instead of requiring a runtime restart.
Without this delivery path the chat-upload code path's "secret was
just minted, will pick up on next heartbeat" 503 message is a lie
and the workspace stays 401-forever until the operator restarts the
runtime. Caught 2026-04-30 on the hongmingwang tenant: the
standalone wrapper (mcp_cli.py) got the same change in #2421, but
the in-container heartbeat (this file) was missed in the first
pass.
Failure is non-fatal: if the body isn't JSON, doesn't carry the
field, or the disk write fails, the next heartbeat retries. This
matches the cold-start register flow in main.py:319-323.
"""
try:
body = resp.json()
except Exception:
return
if not isinstance(body, dict):
return
secret = body.get("platform_inbound_secret")
if not secret:
return
try:
from platform_inbound_auth import save_inbound_secret
save_inbound_secret(secret)
except Exception as exc:
logger.warning(
"heartbeat: persist inbound secret failed: %s", exc
)
HEARTBEAT_INTERVAL = 30 # seconds
MAX_CONSECUTIVE_FAILURES = 10
MAX_SEEN_DELEGATION_IDS = 200
@ -172,7 +212,7 @@ class HeartbeatLoop:
# runtime_state to flip status → degraded.
body.update(_runtime_state_payload())
body.update(_runtime_metadata_payload())
await client.post(
resp = await client.post(
f"{self.platform_url}/registry/heartbeat",
json=body,
headers=auth_headers(),
@ -180,6 +220,16 @@ class HeartbeatLoop:
self.error_count = 0
self.request_count = 0
self._consecutive_failures = 0
# 2026-04-30: persist the platform_inbound_secret
# if the heartbeat response carries one. Mirrors
# the cold-start register flow in main.py:319-323
# and completes the recovery path for workspaces
# whose secret was lazy-healed on the platform
# side after register-time. Without this, the
# workspace stays 401-forever on chat upload
# until restart. See workspace-server PR #2421
# for the server-side delivery change.
_persist_inbound_secret_from_heartbeat(resp)
except Exception as e:
self._consecutive_failures += 1
# Issue #1877: if heartbeat 401'd, re-read the token from disk
@ -202,13 +252,14 @@ class HeartbeatLoop:
"uptime_seconds": int(time.time() - self.start_time),
}
retry_body.update(_runtime_state_payload())
await client.post(
retry_resp = await client.post(
f"{self.platform_url}/registry/heartbeat",
json=retry_body,
headers=auth_headers(),
)
self._consecutive_failures = 0
self.request_count += 1
_persist_inbound_secret_from_heartbeat(retry_resp)
except Exception:
# Retry also failed — fall through to the normal
# failure tracking below.
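
Aside: platform_inbound_auth.save_inbound_secret is imported above but its body is not part of this diff. The only contract the heartbeat code relies on is "persist the string to disk, raising on I/O failure". A minimal sketch of what such a helper could look like follows; the file path and the atomic-rename strategy are illustrative assumptions, not the module's actual implementation.

import os
import tempfile

_SECRET_PATH = "/var/run/molecule/platform_inbound_secret"  # hypothetical location

def save_inbound_secret(secret: str) -> None:
    """Write atomically so a crash mid-write never leaves a truncated
    secret file for the inbound-auth middleware to read."""
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(_SECRET_PATH))
    try:
        with os.fdopen(fd, "w") as f:
            f.write(secret)
        os.replace(tmp, _SECRET_PATH)  # atomic rename on POSIX
    except Exception:
        os.unlink(tmp)
        raise

Atomicity matters here because the middleware may read the file between heartbeats; a plain open(path, "w").write() could expose a half-written secret.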

View File

@ -51,6 +51,16 @@ logger = logging.getLogger(__name__)
# laptop sleep.
HEARTBEAT_INTERVAL_SECONDS = 20.0
# After this many consecutive 401/403 heartbeats, escalate from
# WARNING to ERROR with re-onboard guidance. 3 ticks at 20s = ~1 minute
# of sustained auth failure — enough to rule out a transient platform
# blip but quick enough that an operator doesn't sit puzzled for 10
# minutes wondering why their MCP tools 401. A separate interval
# re-logs the ERROR every 20 ticks (~7 min) so a long-running
# session that missed the first ERROR still sees the message.
_HEARTBEAT_AUTH_LOUD_THRESHOLD = 3
_HEARTBEAT_AUTH_RELOG_INTERVAL = 20
def _platform_register(platform_url: str, workspace_id: str, token: str) -> None:
"""One-shot register at startup; fails fast on auth errors.
@ -145,6 +155,7 @@ def _heartbeat_loop(
return
start_time = time.time()
consecutive_auth_failures = 0
while True:
body = {
"workspace_id": workspace_id,
@ -165,19 +176,69 @@ def _heartbeat_loop(
json=body,
headers=headers,
)
if resp.status_code >= 400:
if resp.status_code in (401, 403):
consecutive_auth_failures += 1
_log_heartbeat_auth_failure(
consecutive_auth_failures, workspace_id, resp.status_code,
)
elif resp.status_code >= 400:
# Non-auth HTTP error — log, but DO NOT touch the
# auth-failure counter (5xx blips, 429, etc. are
# transient and unrelated to token validity).
logger.warning(
"molecule-mcp: heartbeat HTTP %d: %s",
resp.status_code,
(resp.text or "")[:200],
)
else:
consecutive_auth_failures = 0
_persist_inbound_secret_from_heartbeat(resp)
except Exception as exc: # noqa: BLE001
logger.warning("molecule-mcp: heartbeat failed: %s", exc)
time.sleep(interval)
def _log_heartbeat_auth_failure(count: int, workspace_id: str, status_code: int) -> None:
"""Escalate consecutive heartbeat 401/403s from quiet WARNING to
actionable ERROR.
The operator's first sign of trouble shouldn't be "tools 401 with no
explanation" — that was the failure mode that motivated this code,
triggered by a workspace being deleted server-side and its tokens
revoked while the runtime kept heartbeating in silence.
Cadence:
* count < threshold: WARNING per tick (could still be a transient
platform blip, don't shout yet)
* count == threshold: ERROR with re-onboard instructions
(the first signal the operator can't miss)
* count > threshold and (count - threshold) % relog == 0: re-log
ERROR (so a session that started after the first ERROR still
sees the message scrolling past in their logs)
"""
if count < _HEARTBEAT_AUTH_LOUD_THRESHOLD:
logger.warning(
"molecule-mcp: heartbeat HTTP %d (auth failure %d/%d) — "
"token may be revoked. Will retry; if persistent, regenerate "
"from canvas → Tokens.",
status_code, count, _HEARTBEAT_AUTH_LOUD_THRESHOLD,
)
return
# At or past the threshold — this is the loud actionable error.
if count == _HEARTBEAT_AUTH_LOUD_THRESHOLD or (
count - _HEARTBEAT_AUTH_LOUD_THRESHOLD
) % _HEARTBEAT_AUTH_RELOG_INTERVAL == 0:
logger.error(
"molecule-mcp: %d consecutive heartbeat auth failures (HTTP %d) — "
"the token in MOLECULE_WORKSPACE_TOKEN has been REVOKED, likely "
"because workspace %s was deleted server-side. The MCP server is "
"still running but every platform call will fail. Regenerate the "
"workspace + token from the canvas (Tokens tab), update your MCP "
"config, and restart your runtime.",
count, status_code, workspace_id,
)
def _persist_inbound_secret_from_heartbeat(resp: object) -> None:
"""Persist ``platform_inbound_secret`` from a heartbeat response, if any.

View File

@ -355,3 +355,149 @@ def test_on_done_restarts_loop():
mock_create.assert_called_once()
# New task should have done callback
mock_new_task.add_done_callback.assert_called_once()
# ============== In-container heartbeat persists platform_inbound_secret (2026-04-30) ==============
# Pairs with workspace-server PR #2421's heartbeat-delivers-secret change.
# The standalone wrapper (mcp_cli.py) got persistence in #2421; the
# in-container heartbeat (heartbeat.py) was missed and the symptom
# returned: hongmingwang Claude Code agent stayed 401-forever on chat
# upload because the workspace's runtime never picked up the lazy-healed
# secret without a restart.
import heartbeat as heartbeat_mod # noqa: E402
def test_persist_inbound_secret_happy_path(monkeypatch):
"""200 with platform_inbound_secret in body → save_inbound_secret called."""
class FakeResp:
def json(self):
return {"status": "ok", "platform_inbound_secret": "fresh-secret"}
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == ["fresh-secret"]
def test_persist_inbound_secret_skips_when_absent(monkeypatch):
class FakeResp:
def json(self):
return {"status": "ok"}
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_skips_on_empty(monkeypatch):
class FakeResp:
def json(self):
return {"status": "ok", "platform_inbound_secret": ""}
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_swallows_non_json(monkeypatch):
class FakeResp:
def json(self):
raise ValueError("not json")
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
# Must not raise
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_handles_non_dict(monkeypatch):
class FakeResp:
def json(self):
return ["unexpected", "list"]
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_swallows_save_oserror(monkeypatch):
class FakeResp:
def json(self):
return {"platform_inbound_secret": "x"}
def boom(_secret):
raise OSError("disk full")
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", boom)
# Heartbeat liveness > secret persistence — must not raise.
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
@pytest.mark.asyncio
async def test_heartbeat_loop_persists_secret_from_response(monkeypatch):
"""End-to-end: in-container _loop persists secret when the heartbeat
response carries platform_inbound_secret."""
saved: list[str] = []
def fake_persist(resp):
try:
body = resp.json()
except Exception:
return
if isinstance(body, dict) and body.get("platform_inbound_secret"):
saved.append(body["platform_inbound_secret"])
monkeypatch.setattr(
heartbeat_mod,
"_persist_inbound_secret_from_heartbeat",
fake_persist,
)
hb = HeartbeatLoop("http://platform:8080", "ws-abc")
mock_response = MagicMock()
mock_response.json = MagicMock(
return_value={"status": "ok", "platform_inbound_secret": "from-heartbeat"}
)
mock_client = AsyncMock()
mock_client.post = AsyncMock(return_value=mock_response)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
with patch("heartbeat.httpx.AsyncClient", return_value=mock_client):
task = asyncio.create_task(hb._loop())
await asyncio.sleep(0.05)
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
assert saved == ["from-heartbeat"], (
"in-container heartbeat must persist platform_inbound_secret from 200 response"
)

View File

@ -703,3 +703,167 @@ def test_heartbeat_loop_skips_persist_on_4xx(monkeypatch):
)
assert saw == [], "4xx response must NOT trigger persist call"
# ============== Heartbeat auth-failure escalation (2026-05-01) ==============
# When a workspace is deleted server-side (DELETE /workspaces/:id), the
# platform revokes the workspace's auth token. The heartbeat starts
# 401-ing. The previous behavior just logged WARNING on every tick — a
# user tailing logs might miss it, and there was no actionable signal
# anywhere. Escalate after a small number of consecutive auth failures
# so the operator gets a clear "token revoked, re-onboard" message and
# isn't left to puzzle out why their MCP tools 401.
#
# Pairs with the register-time 401 hard-fail path that already exists
# at mcp_cli.py:104-111.
def _multi_iter_runner(monkeypatch, response_status_codes):
"""Run _heartbeat_loop for ``len(response_status_codes)`` iterations.
Each call to FakeClient.post returns a response with the next status
code from ``response_status_codes``. After all responses are consumed,
the next sleep raises SystemExit to break the loop.
"""
import types
iterations = {"count": 0}
target = len(response_status_codes)
class FakeResp:
def __init__(self, status_code):
self.status_code = status_code
self.text = "" if status_code < 400 else '{"error":"invalid workspace auth token"}'
def json(self):
if self.status_code >= 400:
return {"error": "invalid workspace auth token"}
return {"status": "ok"}
class FakeClient:
def __init__(self, **_kw): pass
def __enter__(self): return self
def __exit__(self, *_a): return False
def post(self, *_a, **_kw):
i = iterations["count"]
sc = response_status_codes[i] if i < len(response_status_codes) else 200
return FakeResp(sc)
fake_httpx = types.ModuleType("httpx")
fake_httpx.Client = FakeClient
monkeypatch.setitem(sys.modules, "httpx", fake_httpx)
def fake_sleep(_):
iterations["count"] += 1
if iterations["count"] >= target:
raise SystemExit
monkeypatch.setattr("time.sleep", fake_sleep)
with pytest.raises(SystemExit):
mcp_cli._heartbeat_loop(
"https://test.moleculesai.app",
"ws-deleted-12345678",
"stale-token",
interval=20.0,
)
def test_heartbeat_single_401_logs_warning_not_error(monkeypatch, caplog):
"""One 401 alone is not enough to declare the token dead — could be a
transient platform blip. Log at WARNING; don't shout."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [401])
auth_records = [r for r in caplog.records if "401" in r.message
or "auth" in r.message.lower()
or "revoked" in r.message.lower()]
# At least the WARNING-level mention of HTTP 401 must appear.
assert any(r.levelno == logging.WARNING for r in auth_records), (
f"expected at least one WARNING about 401, got: "
f"{[(r.levelname, r.message) for r in auth_records]}"
)
# Crucially, NOT escalated to ERROR yet — only one failure.
assert not any(r.levelno >= logging.ERROR for r in auth_records), (
"single 401 must not escalate to ERROR — premature alarm"
)
def test_heartbeat_three_consecutive_401s_escalates_to_error(monkeypatch, caplog):
"""Token-revoked is the canonical failure mode after a workspace is
deleted server-side. After 3 consecutive 401s the operator gets a
LOUD ERROR with re-onboard guidance, not one buried at WARNING."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [401, 401, 401])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert error_records, (
f"expected ERROR after 3 consecutive 401s, got only: "
f"{[(r.levelname, r.message[:80]) for r in caplog.records]}"
)
# The message must be actionable — operator needs to know what to do.
msg = " ".join(r.message for r in error_records).lower()
assert "revoked" in msg or "deleted" in msg, (
f"ERROR must explain WHY (token revoked / workspace deleted), got: {msg}"
)
assert "regenerate" in msg or "re-onboard" in msg or "tokens" in msg, (
f"ERROR must point at the canvas Tokens tab so operator knows how to recover, got: {msg}"
)
# The workspace_id should appear so the operator knows which one is dead.
assert "ws-deleted" in msg, f"ERROR must name the dead workspace_id, got: {msg}"
def test_heartbeat_403_treated_same_as_401(monkeypatch, caplog):
"""403 Forbidden is the other auth-failure shape (token valid but
not authorized for this workspace). Same escalation path."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [403, 403, 403])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert error_records, "expected ERROR after 3 consecutive 403s"
def test_heartbeat_recovery_resets_consecutive_counter(monkeypatch, caplog):
"""If the platform comes back to 200 in the middle of an outage,
the auth-failure counter must reset. A subsequent isolated 401
later should NOT immediately escalate."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
# Two 401s, then 200, then one 401. If counter resets correctly,
# the final 401 is "1 consecutive" and should NOT escalate.
_multi_iter_runner(monkeypatch, [401, 401, 200, 401])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert not error_records, (
f"recovered (200) → reset counter → final isolated 401 must NOT "
f"escalate. Got ERRORs: {[r.message[:80] for r in error_records]}"
)
def test_heartbeat_500_does_not_increment_auth_counter(monkeypatch, caplog):
"""5xx is a server-side blip, not auth. Three consecutive 500s
must NOT trigger the 'token revoked' escalation, which would
mislead the operator."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [500, 500, 500])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
revoked_errors = [r for r in error_records if "revoked" in r.message.lower()]
assert not revoked_errors, (
f"5xx must NOT be classified as auth failure — would mislead operator. "
f"Got 'revoked' ERRORs: {[r.message[:80] for r in revoked_errors]}"
)