Merge pull request #2427 from Molecule-AI/staging
staging → main: auto-promote 2a56697
commit ace3c85708
@@ -301,15 +301,34 @@ export function ConfigTab({ workspaceId }: Props) {
   // partial-save state — we report it as a user-visible warning
   // rather than lying "Saved" and letting the user discover the
   // revert on next reload.
-  const oldModel = (oldParsed.model as string) || "";
+  //
+  // Read from runtime_config.model first, then fall back to top-level
+  // model. The dropdown's onChange (above, ~line 475) writes to
+  // runtime_config.model whenever a runtime is selected (hermes,
+  // claude-code, etc.) and only falls back to top-level model when
+  // there's no runtime. handleSave used to diff against top-level
+  // model only, so for any runtime-bearing workspace the user's
+  // model selection never persisted — they'd Save & Restart, the
+  // EC2 would boot with HERMES_DEFAULT_MODEL empty, and hermes
+  // would fall back to nousresearch/hermes-4-70b → "No LLM provider
+  // configured" error in the chat. Caught 2026-04-30 on hongmingwang
+  // hermes workspace 32993ee7-…cb9d75d112a5.
+  const nextModelRaw = (nextSource.runtime_config as Record<string, unknown> | undefined)?.model;
+  const oldModelRaw = (oldParsed.runtime_config as Record<string, unknown> | undefined)?.model;
+  const nextModel =
+    typeof nextModelRaw === "string" && nextModelRaw
+      ? nextModelRaw
+      : typeof nextSource.model === "string"
+        ? nextSource.model
+        : "";
+  const oldModel =
+    typeof oldModelRaw === "string" && oldModelRaw
+      ? oldModelRaw
+      : (oldParsed.model as string) || "";
   let modelSaveError: string | null = null;
-  if (
-    typeof nextSource.model === "string" &&
-    nextSource.model &&
-    nextSource.model !== oldModel
-  ) {
+  if (nextModel && nextModel !== oldModel) {
     try {
-      await api.put(`/workspaces/${workspaceId}/model`, { model: nextSource.model });
+      await api.put(`/workspaces/${workspaceId}/model`, { model: nextModel });
     } catch (e) {
      modelSaveError = e instanceof Error ? e.message : "Model update was rejected";
    }
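For reference, the precedence rule the new Save handler implements, restated as a minimal standalone sketch (Python here for uniformity with the rest of this PR; resolve_model is an illustrative name, not code from the diff):

def resolve_model(next_source: dict) -> str:
    """runtime_config.model wins; top-level model is only a fallback."""
    runtime_cfg = next_source.get("runtime_config") or {}
    model = runtime_cfg.get("model")
    if isinstance(model, str) and model:
        return model
    top = next_source.get("model")
    return top if isinstance(top, str) else ""

# Runtime-bearing workspace: the dropdown wrote under runtime_config.
assert resolve_model({"runtime_config": {"model": "m-a"}, "model": ""}) == "m-a"
# No runtime: fall back to the top-level field.
assert resolve_model({"model": "m-b"}) == "m-b"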
@@ -11,7 +11,7 @@
 // Each test pins one invariant. If any fails, the bug is back.

 import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
-import { render, screen, cleanup, waitFor } from "@testing-library/react";
+import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
 import React from "react";

 afterEach(cleanup);
@@ -168,6 +168,116 @@ describe("ConfigTab — hermes workspace", () => {
   });
 });

+describe("ConfigTab — Save persists model under runtime_config.model (2026-04-30)", () => {
+  // The dropdown's onChange writes to config.runtime_config.model whenever
+  // a runtime is selected (hermes, claude-code, etc.) and only falls back
+  // to top-level config.model when no runtime is set. The Save handler used
+  // to diff against top-level model only, so for any runtime-bearing
+  // workspace the user's model selection never persisted — Save & Restart
+  // would reboot with HERMES_DEFAULT_MODEL empty, hermes would fall back
+  // to nousresearch/hermes-4-70b → "No LLM provider configured" in chat.
+  // Caught 2026-04-30 on hongmingwang hermes workspace.
+
+  it("PUTs /model when user picks a model on a hermes workspace", async () => {
+    apiGet.mockImplementation((path: string) => {
+      if (path === "/workspaces/ws-test") {
+        return Promise.resolve({ runtime: "hermes" });
+      }
+      if (path === "/workspaces/ws-test/model") {
+        return Promise.resolve({ model: "" });
+      }
+      if (path === "/workspaces/ws-test/files/config.yaml") {
+        return Promise.reject(new Error("not found"));
+      }
+      if (path === "/templates") {
+        return Promise.resolve([
+          {
+            id: "t-hermes",
+            name: "Hermes",
+            runtime: "hermes",
+            models: [
+              { id: "minimax/MiniMax-M2.7-highspeed", name: "MiniMax M2.7" },
+            ],
+          },
+        ]);
+      }
+      return Promise.reject(new Error(`unmocked api.get: ${path}`));
+    });
+    apiPut.mockResolvedValue({});
+    apiPatch.mockResolvedValue({});
+
+    render(<ConfigTab workspaceId="ws-test" />);
+
+    // Wait for the runtime dropdown to populate so the model textbox renders.
+    await waitFor(() =>
+      expect(
+        (screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
+      ).toBe("hermes"),
+    );
+
+    // The model input is a free-text input wired to a datalist of suggestions.
+    const modelInput = (await waitFor(() =>
+      screen.getByPlaceholderText(/anthropic:claude-sonnet/i),
+    )) as HTMLInputElement;
+
+    fireEvent.change(modelInput, {
+      target: { value: "minimax/MiniMax-M2.7-highspeed" },
+    });
+
+    // Click Save & Restart.
+    fireEvent.click(screen.getByRole("button", { name: /save & restart/i }));
+
+    await waitFor(() => {
+      expect(apiPut).toHaveBeenCalledWith("/workspaces/ws-test/model", {
+        model: "minimax/MiniMax-M2.7-highspeed",
+      });
+    });
+  });
+
+  it("does NOT PUT /model when the value is unchanged (no-op restart)", async () => {
+    apiGet.mockImplementation((path: string) => {
+      if (path === "/workspaces/ws-test") {
+        return Promise.resolve({ runtime: "hermes" });
+      }
+      if (path === "/workspaces/ws-test/model") {
+        return Promise.resolve({ model: "minimax/MiniMax-M2.7" });
+      }
+      if (path === "/workspaces/ws-test/files/config.yaml") {
+        return Promise.reject(new Error("not found"));
+      }
+      if (path === "/templates") {
+        return Promise.resolve([
+          { id: "t-hermes", runtime: "hermes", models: [] },
+        ]);
+      }
+      return Promise.reject(new Error(`unmocked api.get: ${path}`));
+    });
+    apiPut.mockResolvedValue({});
+
+    render(<ConfigTab workspaceId="ws-test" />);
+
+    // Wait for load.
+    await waitFor(() =>
+      expect(
+        (screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
+      ).toBe("hermes"),
+    );
+
+    // We could force isDirty by toggling a field that doesn't affect model
+    // (Save is disabled until isDirty=true) and prove that even when Save
+    // fires, /model isn't called for an unchanged model. It's simpler,
+    // though, to verify apiPut was never called with the model URL.
+
+    // Without any user edit, Save & Restart stays disabled, so /model is
+    // trivially not PUT. The assertion below verifies no /model PUT happens
+    // at any point during load.
+    const modelPuts = apiPut.mock.calls.filter(
+      ([path]) => path === "/workspaces/ws-test/model",
+    );
+    expect(modelPuts).toHaveLength(0);
+  });
+});
+
 describe("ConfigTab — config.yaml on disk", () => {
   it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
     // Priority inversion in #2061: previously config.yaml overrode DB, so
@@ -91,6 +91,46 @@ def _runtime_metadata_payload() -> dict:

 logger = logging.getLogger(__name__)

+
+def _persist_inbound_secret_from_heartbeat(resp) -> None:
+    """Persist ``platform_inbound_secret`` from a heartbeat response, if any.
+
+    The platform's heartbeat handler (workspace-server PR #2421) returns
+    the secret on every beat — mirrors /registry/register so a workspace
+    whose secret was lazy-healed on the platform side picks it up within
+    one heartbeat tick instead of requiring a runtime restart.
+
+    Without this delivery path the chat-upload code path's "secret was
+    just minted, will pick up on next heartbeat" 503 message is a lie
+    and the workspace stays 401-forever until the operator restarts the
+    runtime. Caught 2026-04-30 on the hongmingwang tenant — the
+    standalone wrapper (mcp_cli.py) got the same change in #2421 but
+    the in-container heartbeat (this file) was missed in the first
+    pass.
+
+    Failure is non-fatal: if the body isn't JSON, doesn't carry the
+    field, or the disk write fails, the next heartbeat retries. This
+    matches the cold-start register flow in main.py:319-323.
+    """
+    try:
+        body = resp.json()
+    except Exception:
+        return
+    if not isinstance(body, dict):
+        return
+    secret = body.get("platform_inbound_secret")
+    if not secret:
+        return
+    try:
+        from platform_inbound_auth import save_inbound_secret
+
+        save_inbound_secret(secret)
+    except Exception as exc:
+        logger.warning(
+            "heartbeat: persist inbound secret failed: %s", exc
+        )
+
+
 HEARTBEAT_INTERVAL = 30  # seconds
 MAX_CONSECUTIVE_FAILURES = 10
 MAX_SEEN_DELEGATION_IDS = 200
@@ -172,7 +212,7 @@ class HeartbeatLoop:
                 # runtime_state to flip status → degraded.
                 body.update(_runtime_state_payload())
                 body.update(_runtime_metadata_payload())
-                await client.post(
+                resp = await client.post(
                     f"{self.platform_url}/registry/heartbeat",
                     json=body,
                     headers=auth_headers(),

@@ -180,6 +220,16 @@ class HeartbeatLoop:
                 self.error_count = 0
                 self.request_count = 0
                 self._consecutive_failures = 0
+                # 2026-04-30: persist the platform_inbound_secret
+                # if the heartbeat response carries one. Mirrors
+                # the cold-start register flow in main.py:319-323
+                # and closes the recovery path for workspaces
+                # whose secret was lazy-healed on the platform
+                # side after register-time. Without this, the
+                # workspace stays 401-forever on chat upload
+                # until restart. See workspace-server PR #2421
+                # for the server-side delivery change.
+                _persist_inbound_secret_from_heartbeat(resp)
             except Exception as e:
                 self._consecutive_failures += 1
                 # Issue #1877: if heartbeat 401'd, re-read the token from disk

@@ -202,13 +252,14 @@ class HeartbeatLoop:
                         "uptime_seconds": int(time.time() - self.start_time),
                     }
                     retry_body.update(_runtime_state_payload())
-                    await client.post(
+                    retry_resp = await client.post(
                         f"{self.platform_url}/registry/heartbeat",
                         json=retry_body,
                         headers=auth_headers(),
                     )
                     self._consecutive_failures = 0
                     self.request_count += 1
+                    _persist_inbound_secret_from_heartbeat(retry_resp)
                 except Exception:
                     # Retry also failed — fall through to the normal
                     # failure tracking below.
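A minimal sketch of the property this wiring buys: a secret lazy-healed on the platform between beats is picked up on the very next tick, with no restart. FakePlatform and persist are illustrative stand-ins for the platform handler and _persist_inbound_secret_from_heartbeat, not code from this PR:

class FakePlatform:
    def __init__(self):
        self.secret = None  # not yet healed

    def heartbeat_body(self) -> dict:
        body = {"status": "ok"}
        if self.secret:
            body["platform_inbound_secret"] = self.secret
        return body

saved = []

def persist(body: dict) -> None:
    # same skip-if-absent contract as the real helper, minus the disk I/O
    secret = body.get("platform_inbound_secret")
    if secret:
        saved.append(secret)

platform = FakePlatform()
persist(platform.heartbeat_body())   # tick 1: nothing to persist yet
platform.secret = "healed-secret"    # platform lazy-heals between ticks
persist(platform.heartbeat_body())   # tick 2: picked up, no restart needed
assert saved == ["healed-secret"]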
@@ -51,6 +51,16 @@ logger = logging.getLogger(__name__)
 # laptop sleep.
 HEARTBEAT_INTERVAL_SECONDS = 20.0

+# After this many consecutive 401/403 heartbeats, escalate from
+# WARNING to ERROR with re-onboard guidance. 3 ticks at 20s = ~1 minute
+# of sustained auth failure — enough to rule out a transient platform
+# blip but quick enough that an operator doesn't sit puzzled for 10
+# minutes wondering why their MCP tools 401. After the escalation, the
+# ERROR is re-logged at 20-tick (~7 min) intervals so a long-running
+# session that missed the first ERROR still sees the message.
+_HEARTBEAT_AUTH_LOUD_THRESHOLD = 3
+_HEARTBEAT_AUTH_RELOG_INTERVAL = 20
+

 def _platform_register(platform_url: str, workspace_id: str, token: str) -> None:
     """One-shot register at startup; fails fast on auth errors.

@@ -145,6 +155,7 @@ def _heartbeat_loop(
         return

     start_time = time.time()
+    consecutive_auth_failures = 0
     while True:
         body = {
             "workspace_id": workspace_id,

@@ -165,19 +176,69 @@ def _heartbeat_loop(
                 json=body,
                 headers=headers,
             )
-            if resp.status_code >= 400:
+            if resp.status_code in (401, 403):
+                consecutive_auth_failures += 1
+                _log_heartbeat_auth_failure(
+                    consecutive_auth_failures, workspace_id, resp.status_code,
+                )
+            elif resp.status_code >= 400:
+                # Non-auth HTTP error — log, but DO NOT touch the
+                # auth-failure counter (5xx blips, 429, etc. are
+                # transient and unrelated to token validity).
                 logger.warning(
                     "molecule-mcp: heartbeat HTTP %d: %s",
                     resp.status_code,
                     (resp.text or "")[:200],
                 )
+            else:
+                consecutive_auth_failures = 0
+                _persist_inbound_secret_from_heartbeat(resp)
         except Exception as exc:  # noqa: BLE001
             logger.warning("molecule-mcp: heartbeat failed: %s", exc)
         time.sleep(interval)


+def _log_heartbeat_auth_failure(count: int, workspace_id: str, status_code: int) -> None:
+    """Escalate consecutive heartbeat 401/403s from quiet WARNING to
+    actionable ERROR.
+
+    The operator's first sign of trouble shouldn't be "tools 401 with no
+    explanation" — that was the failure mode that motivated this code,
+    triggered by a workspace being deleted server-side and its tokens
+    revoked while the runtime kept heartbeating in silence.
+
+    Cadence:
+      * count < threshold: WARNING per tick (transient — could be a
+        platform blip, don't shout yet)
+      * count == threshold: ERROR with re-onboard instructions
+        (the first signal the operator can't miss)
+      * count > threshold and (count - threshold) % relog == 0: re-log
+        the ERROR (so a session that started after the first ERROR still
+        sees the message scrolling past in their logs)
+    """
+    if count < _HEARTBEAT_AUTH_LOUD_THRESHOLD:
+        logger.warning(
+            "molecule-mcp: heartbeat HTTP %d (auth failure %d/%d) — "
+            "token may be revoked. Will retry; if persistent, regenerate "
+            "from canvas → Tokens.",
+            status_code, count, _HEARTBEAT_AUTH_LOUD_THRESHOLD,
+        )
+        return
+    # At or past the threshold — this is the loud, actionable error.
+    if count == _HEARTBEAT_AUTH_LOUD_THRESHOLD or (
+        count - _HEARTBEAT_AUTH_LOUD_THRESHOLD
+    ) % _HEARTBEAT_AUTH_RELOG_INTERVAL == 0:
+        logger.error(
+            "molecule-mcp: %d consecutive heartbeat auth failures (HTTP %d) — "
+            "the token in MOLECULE_WORKSPACE_TOKEN has been REVOKED, likely "
+            "because workspace %s was deleted server-side. The MCP server is "
+            "still running but every platform call will fail. Regenerate the "
+            "workspace + token from the canvas (Tokens tab), update your MCP "
+            "config, and restart your runtime.",
+            count, status_code, workspace_id,
+        )
+
+
 def _persist_inbound_secret_from_heartbeat(resp: object) -> None:
     """Persist ``platform_inbound_secret`` from a heartbeat response, if any.
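The escalation cadence, checked standalone: the same arithmetic as _log_heartbeat_auth_failure with threshold 3 and relog interval 20. level_for is an illustrative name, not the shipped function:

THRESHOLD = 3   # _HEARTBEAT_AUTH_LOUD_THRESHOLD
RELOG = 20      # _HEARTBEAT_AUTH_RELOG_INTERVAL

def level_for(count: int) -> str:
    if count < THRESHOLD:
        return "WARNING"
    if count == THRESHOLD or (count - THRESHOLD) % RELOG == 0:
        return "ERROR"
    return "(silent)"

# Ticks 1-2 warn, tick 3 is the first ERROR, ticks 4..22 stay quiet,
# then the ERROR re-logs at 23, 43, ...
assert [level_for(n) for n in (1, 2, 3, 4, 23)] == [
    "WARNING", "WARNING", "ERROR", "(silent)", "ERROR",
]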
@@ -355,3 +355,149 @@ def test_on_done_restarts_loop():
     mock_create.assert_called_once()
     # New task should have done callback
     mock_new_task.add_done_callback.assert_called_once()
+
+
+# ============== In-container heartbeat persists platform_inbound_secret (2026-04-30) ==============
+# Pairs with workspace-server PR #2421's heartbeat-delivers-secret change.
+# The standalone wrapper (mcp_cli.py) got persistence in #2421; the
+# in-container heartbeat (heartbeat.py) was missed and the symptom
+# returned: hongmingwang Claude Code agent stayed 401-forever on chat
+# upload because the workspace's runtime never picked up the lazy-healed
+# secret without a restart.
+
+import heartbeat as heartbeat_mod  # noqa: E402
+
+
+def test_persist_inbound_secret_happy_path(monkeypatch):
+    """200 with platform_inbound_secret in body → save_inbound_secret called."""
+
+    class FakeResp:
+        def json(self):
+            return {"status": "ok", "platform_inbound_secret": "fresh-secret"}
+
+    saved: list[str] = []
+    import platform_inbound_auth
+
+    monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
+
+    heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
+
+    assert saved == ["fresh-secret"]
+
+
+def test_persist_inbound_secret_skips_when_absent(monkeypatch):
+    class FakeResp:
+        def json(self):
+            return {"status": "ok"}
+
+    saved: list[str] = []
+    import platform_inbound_auth
+
+    monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
+
+    heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
+    assert saved == []
+
+
+def test_persist_inbound_secret_skips_on_empty(monkeypatch):
+    class FakeResp:
+        def json(self):
+            return {"status": "ok", "platform_inbound_secret": ""}
+
+    saved: list[str] = []
+    import platform_inbound_auth
+
+    monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
+
+    heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
+    assert saved == []
+
+
+def test_persist_inbound_secret_swallows_non_json(monkeypatch):
+    class FakeResp:
+        def json(self):
+            raise ValueError("not json")
+
+    saved: list[str] = []
+    import platform_inbound_auth
+
+    monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
+
+    # Must not raise
+    heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
+    assert saved == []
+
+
+def test_persist_inbound_secret_handles_non_dict(monkeypatch):
+    class FakeResp:
+        def json(self):
+            return ["unexpected", "list"]
+
+    saved: list[str] = []
+    import platform_inbound_auth
+
+    monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
+
+    heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
+    assert saved == []
+
+
+def test_persist_inbound_secret_swallows_save_oserror(monkeypatch):
+    class FakeResp:
+        def json(self):
+            return {"platform_inbound_secret": "x"}
+
+    def boom(_secret):
+        raise OSError("disk full")
+
+    import platform_inbound_auth
+
+    monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", boom)
+
+    # Heartbeat liveness > secret persistence — must not raise.
+    heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
+
+
+@pytest.mark.asyncio
+async def test_heartbeat_loop_persists_secret_from_response(monkeypatch):
+    """End-to-end: in-container _loop persists secret when the heartbeat
+    response carries platform_inbound_secret."""
+    saved: list[str] = []
+
+    def fake_persist(resp):
+        try:
+            body = resp.json()
+        except Exception:
+            return
+        if isinstance(body, dict) and body.get("platform_inbound_secret"):
+            saved.append(body["platform_inbound_secret"])
+
+    monkeypatch.setattr(
+        heartbeat_mod,
+        "_persist_inbound_secret_from_heartbeat",
+        fake_persist,
+    )
+
+    hb = HeartbeatLoop("http://platform:8080", "ws-abc")
+
+    mock_response = MagicMock()
+    mock_response.json = MagicMock(
+        return_value={"status": "ok", "platform_inbound_secret": "from-heartbeat"}
+    )
+    mock_client = AsyncMock()
+    mock_client.post = AsyncMock(return_value=mock_response)
+    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+    mock_client.__aexit__ = AsyncMock(return_value=False)
+
+    with patch("heartbeat.httpx.AsyncClient", return_value=mock_client):
+        task = asyncio.create_task(hb._loop())
+        await asyncio.sleep(0.05)
+        task.cancel()
+        try:
+            await task
+        except asyncio.CancelledError:
+            pass
+
+    assert saved == ["from-heartbeat"], (
+        "in-container heartbeat must persist platform_inbound_secret from 200 response"
+    )
@@ -703,3 +703,167 @@ def test_heartbeat_loop_skips_persist_on_4xx(monkeypatch):
     )

     assert saw == [], "4xx response must NOT trigger persist call"
+
+
+# ============== Heartbeat auth-failure escalation (2026-05-01) ==============
+# When a workspace is deleted server-side (DELETE /workspaces/:id), the
+# platform revokes the workspace's auth token. The heartbeat starts
+# 401-ing. The previous behavior just logged WARNING on every tick — a
+# user tailing logs might miss it, and there was no actionable signal
+# anywhere. Escalate after a small number of consecutive auth failures
+# so the operator gets a clear "token revoked, re-onboard" message and
+# isn't left to puzzle out why their MCP tools 401.
+#
+# Pairs with the register-time 401 hard-fail path that already exists
+# at mcp_cli.py:104-111.
+
+
+def _multi_iter_runner(monkeypatch, response_status_codes):
+    """Run _heartbeat_loop for ``len(response_status_codes)`` iterations.
+
+    Each call to FakeClient.post returns a response with the next status
+    code from ``response_status_codes``. After all responses are consumed,
+    the next sleep raises SystemExit to break the loop.
+    """
+    import types
+
+    iterations = {"count": 0}
+    target = len(response_status_codes)
+
+    class FakeResp:
+        def __init__(self, status_code):
+            self.status_code = status_code
+            self.text = "" if status_code < 400 else '{"error":"invalid workspace auth token"}'
+
+        def json(self):
+            if self.status_code >= 400:
+                return {"error": "invalid workspace auth token"}
+            return {"status": "ok"}
+
+    class FakeClient:
+        def __init__(self, **_kw): pass
+        def __enter__(self): return self
+        def __exit__(self, *_a): return False
+
+        def post(self, *_a, **_kw):
+            i = iterations["count"]
+            sc = response_status_codes[i] if i < len(response_status_codes) else 200
+            return FakeResp(sc)
+
+    fake_httpx = types.ModuleType("httpx")
+    fake_httpx.Client = FakeClient
+    monkeypatch.setitem(sys.modules, "httpx", fake_httpx)
+
+    def fake_sleep(_):
+        iterations["count"] += 1
+        if iterations["count"] >= target:
+            raise SystemExit
+
+    monkeypatch.setattr("time.sleep", fake_sleep)
+
+    with pytest.raises(SystemExit):
+        mcp_cli._heartbeat_loop(
+            "https://test.moleculesai.app",
+            "ws-deleted-12345678",
+            "stale-token",
+            interval=20.0,
+        )
+
+
+def test_heartbeat_single_401_logs_warning_not_error(monkeypatch, caplog):
+    """One 401 alone is not enough to declare the token dead — could be a
+    transient platform blip. Log at WARNING; don't shout."""
+    import logging
+
+    caplog.set_level(logging.WARNING, logger="mcp_cli")
+
+    _multi_iter_runner(monkeypatch, [401])
+
+    auth_records = [
+        r for r in caplog.records
+        if "401" in r.message
+        or "auth" in r.message.lower()
+        or "revoked" in r.message.lower()
+    ]
+    # At least the WARNING-level mention of HTTP 401 must appear.
+    assert any(r.levelno == logging.WARNING for r in auth_records), (
+        f"expected at least one WARNING about 401, got: "
+        f"{[(r.levelname, r.message) for r in auth_records]}"
+    )
+    # Crucially, NOT escalated to ERROR yet — only one failure.
+    assert not any(r.levelno >= logging.ERROR for r in auth_records), (
+        "single 401 must not escalate to ERROR — premature alarm"
+    )
+
+
+def test_heartbeat_three_consecutive_401s_escalates_to_error(monkeypatch, caplog):
+    """Token-revoked is the canonical failure mode after a workspace is
+    deleted server-side. After 3 consecutive 401s the operator gets a
+    LOUD ERROR with re-onboard guidance — not buried at WARNING."""
+    import logging
+
+    caplog.set_level(logging.WARNING, logger="mcp_cli")
+
+    _multi_iter_runner(monkeypatch, [401, 401, 401])
+
+    error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
+    assert error_records, (
+        f"expected ERROR after 3 consecutive 401s, got only: "
+        f"{[(r.levelname, r.message[:80]) for r in caplog.records]}"
+    )
+    # The message must be actionable — operator needs to know what to do.
+    msg = " ".join(r.message for r in error_records).lower()
+    assert "revoked" in msg or "deleted" in msg, (
+        f"ERROR must explain WHY (token revoked / workspace deleted), got: {msg}"
+    )
+    assert "regenerate" in msg or "re-onboard" in msg or "tokens" in msg, (
+        f"ERROR must point at the canvas Tokens tab so operator knows how to recover, got: {msg}"
+    )
+    # The workspace_id should appear so the operator knows which one is dead.
+    assert "ws-deleted" in msg, f"ERROR must name the dead workspace_id, got: {msg}"
+
+
+def test_heartbeat_403_treated_same_as_401(monkeypatch, caplog):
+    """403 Forbidden is the other auth-failure shape (token valid but
+    not authorized for this workspace). Same escalation path."""
+    import logging
+
+    caplog.set_level(logging.WARNING, logger="mcp_cli")
+
+    _multi_iter_runner(monkeypatch, [403, 403, 403])
+
+    error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
+    assert error_records, "expected ERROR after 3 consecutive 403s"
+
+
+def test_heartbeat_recovery_resets_consecutive_counter(monkeypatch, caplog):
+    """If the platform comes back to 200 in the middle of an outage,
+    the auth-failure counter must reset. A subsequent isolated 401
+    later should NOT immediately escalate."""
+    import logging
+
+    caplog.set_level(logging.WARNING, logger="mcp_cli")
+
+    # Two 401s, then 200, then one 401. If the counter resets correctly,
+    # the final 401 is "1 consecutive" and should NOT escalate.
+    _multi_iter_runner(monkeypatch, [401, 401, 200, 401])
+
+    error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
+    assert not error_records, (
+        f"recovered (200) → reset counter → final isolated 401 must NOT "
+        f"escalate. Got ERRORs: {[r.message[:80] for r in error_records]}"
+    )
+
+
+def test_heartbeat_500_does_not_increment_auth_counter(monkeypatch, caplog):
+    """5xx is a server-side blip, not auth. Three consecutive 500s
+    must NOT trigger the 'token revoked' escalation — that would be
+    misleading the operator."""
+    import logging
+
+    caplog.set_level(logging.WARNING, logger="mcp_cli")
+
+    _multi_iter_runner(monkeypatch, [500, 500, 500])
+
+    error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
+    revoked_errors = [r for r in error_records if "revoked" in r.message.lower()]
+    assert not revoked_errors, (
+        f"5xx must NOT be classified as auth failure — would mislead operator. "
+        f"Got 'revoked' ERRORs: {[r.message[:80] for r in revoked_errors]}"
+    )