Merge pull request #2427 from Molecule-AI/staging

staging → main: auto-promote 2a56697
This commit is contained in:
github-actions[bot] 2026-04-30 19:08:15 -07:00 committed by GitHub
commit ace3c85708
6 changed files with 562 additions and 11 deletions

View File

@ -301,15 +301,34 @@ export function ConfigTab({ workspaceId }: Props) {
// partial-save state — we report it as a user-visible warning
// rather than lying "Saved" and letting the user discover the
// revert on next reload.
const oldModel = (oldParsed.model as string) || "";
//
// Read from runtime_config.model first, then fall back to top-level
// model. The dropdown's onChange (above, ~line 475) writes to
// runtime_config.model whenever a runtime is selected (hermes,
// claude-code, etc.) and only falls back to top-level model when
// there's no runtime. handleSave used to diff against top-level
// model only, so for any runtime-bearing workspace the user's
// model selection never persisted — they'd Save & Restart, the
// EC2 would boot with HERMES_DEFAULT_MODEL empty, and hermes
// would fall back to nousresearch/hermes-4-70b → "No LLM provider
// configured" error in the chat. Caught 2026-04-30 on hongmingwang
// hermes workspace 32993ee7-…cb9d75d112a5.
const nextModelRaw = (nextSource.runtime_config as Record<string, unknown> | undefined)?.model;
const oldModelRaw = (oldParsed.runtime_config as Record<string, unknown> | undefined)?.model;
const nextModel =
typeof nextModelRaw === "string" && nextModelRaw
? nextModelRaw
: typeof nextSource.model === "string"
? nextSource.model
: "";
const oldModel =
typeof oldModelRaw === "string" && oldModelRaw
? oldModelRaw
: (oldParsed.model as string) || "";
let modelSaveError: string | null = null;
if (
typeof nextSource.model === "string" &&
nextSource.model &&
nextSource.model !== oldModel
) {
if (nextModel && nextModel !== oldModel) {
try {
await api.put(`/workspaces/${workspaceId}/model`, { model: nextSource.model });
await api.put(`/workspaces/${workspaceId}/model`, { model: nextModel });
} catch (e) {
modelSaveError = e instanceof Error ? e.message : "Model update was rejected";
}

View File

@ -11,7 +11,7 @@
// Each test pins one invariant. If any fails, the bug is back.
import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
import { render, screen, cleanup, waitFor } from "@testing-library/react";
import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
import React from "react";
afterEach(cleanup);
@ -168,6 +168,116 @@ describe("ConfigTab — hermes workspace", () => {
});
});
describe("ConfigTab — Save persists model under runtime_config.model (2026-04-30)", () => {
// The dropdown's onChange writes to config.runtime_config.model whenever
// a runtime is selected (hermes, claude-code, etc.) and only falls back
// to top-level config.model when no runtime is set. The Save handler used
// to diff against top-level model only, so for any runtime-bearing
// workspace the user's model selection never persisted — Save & Restart
// would reboot with HERMES_DEFAULT_MODEL empty, hermes would fall back
// to nousresearch/hermes-4-70b → "No LLM provider configured" in chat.
// Caught 2026-04-30 on hongmingwang hermes workspace.
it("PUTs /model when user picks a model on a hermes workspace", async () => {
apiGet.mockImplementation((path: string) => {
if (path === "/workspaces/ws-test") {
return Promise.resolve({ runtime: "hermes" });
}
if (path === "/workspaces/ws-test/model") {
return Promise.resolve({ model: "" });
}
if (path === "/workspaces/ws-test/files/config.yaml") {
return Promise.reject(new Error("not found"));
}
if (path === "/templates") {
return Promise.resolve([
{
id: "t-hermes",
name: "Hermes",
runtime: "hermes",
models: [
{ id: "minimax/MiniMax-M2.7-highspeed", name: "MiniMax M2.7" },
],
},
]);
}
return Promise.reject(new Error(`unmocked api.get: ${path}`));
});
apiPut.mockResolvedValue({});
apiPatch.mockResolvedValue({});
render(<ConfigTab workspaceId="ws-test" />);
// Wait for the runtime dropdown to populate so the model textbox renders.
await waitFor(() =>
expect(
(screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
).toBe("hermes"),
);
// The model input is a free-text input wired to a datalist of suggestions.
const modelInput = (await waitFor(() =>
screen.getByPlaceholderText(/anthropic:claude-sonnet/i),
)) as HTMLInputElement;
fireEvent.change(modelInput, {
target: { value: "minimax/MiniMax-M2.7-highspeed" },
});
// Click Save & Restart.
fireEvent.click(screen.getByRole("button", { name: /save & restart/i }));
await waitFor(() => {
expect(apiPut).toHaveBeenCalledWith("/workspaces/ws-test/model", {
model: "minimax/MiniMax-M2.7-highspeed",
});
});
});
it("does NOT PUT /model when the value is unchanged (no-op restart)", async () => {
apiGet.mockImplementation((path: string) => {
if (path === "/workspaces/ws-test") {
return Promise.resolve({ runtime: "hermes" });
}
if (path === "/workspaces/ws-test/model") {
return Promise.resolve({ model: "minimax/MiniMax-M2.7" });
}
if (path === "/workspaces/ws-test/files/config.yaml") {
return Promise.reject(new Error("not found"));
}
if (path === "/templates") {
return Promise.resolve([
{ id: "t-hermes", runtime: "hermes", models: [] },
]);
}
return Promise.reject(new Error(`unmocked api.get: ${path}`));
});
apiPut.mockResolvedValue({});
render(<ConfigTab workspaceId="ws-test" />);
// Wait for load.
await waitFor(() =>
expect(
(screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
).toBe("hermes"),
);
// We considered forcing isDirty by toggling a field that doesn't affect
// the model (Save is disabled until isDirty=true), to prove that even
// when Save fires, /model isn't called for an unchanged model. Simpler:
// without any user edit, Save & Restart stays disabled, so /model is
// trivially not PUT. The assert below verifies no /model PUT happens
// at any point during load.
const modelPuts = apiPut.mock.calls.filter(
([path]) => path === "/workspaces/ws-test/model",
);
expect(modelPuts).toHaveLength(0);
});
});
describe("ConfigTab — config.yaml on disk", () => {
it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
// Priority inversion in #2061: previously config.yaml overrode DB, so

View File

@ -91,6 +91,46 @@ def _runtime_metadata_payload() -> dict:
logger = logging.getLogger(__name__)
def _persist_inbound_secret_from_heartbeat(resp) -> None:
"""Persist ``platform_inbound_secret`` from a heartbeat response, if any.
The platform's heartbeat handler (workspace-server PR #2421) returns
the secret on every beat, mirroring /registry/register, so a workspace
whose secret was lazy-healed on the platform side picks it up within
one heartbeat tick instead of requiring a runtime restart.
Without this delivery path the chat-upload code path's "secret was
just minted, will pick up on next heartbeat" 503 message is a lie
and the workspace stays 401-forever until the operator restarts the
runtime. Caught 2026-04-30 on the hongmingwang tenant: the
standalone wrapper (mcp_cli.py) got the same change in #2421, but
the in-container heartbeat (this file) was missed in the first
pass.
Failure is non-fatal: if the body isn't JSON, doesn't carry the
field, or the disk write fails, the next heartbeat retries. This
matches the cold-start register flow in main.py:319-323.
"""
try:
body = resp.json()
except Exception:
return
if not isinstance(body, dict):
return
secret = body.get("platform_inbound_secret")
if not secret:
return
try:
from platform_inbound_auth import save_inbound_secret
save_inbound_secret(secret)
except Exception as exc:
logger.warning(
"heartbeat: persist inbound secret failed: %s", exc
)
HEARTBEAT_INTERVAL = 30 # seconds
MAX_CONSECUTIVE_FAILURES = 10
MAX_SEEN_DELEGATION_IDS = 200
@ -172,7 +212,7 @@ class HeartbeatLoop:
# runtime_state to flip status → degraded.
body.update(_runtime_state_payload())
body.update(_runtime_metadata_payload())
await client.post(
resp = await client.post(
f"{self.platform_url}/registry/heartbeat",
json=body,
headers=auth_headers(),
@ -180,6 +220,16 @@ class HeartbeatLoop:
self.error_count = 0
self.request_count = 0
self._consecutive_failures = 0
# 2026-04-30: persist the platform_inbound_secret
# if the heartbeat response carries one. Mirrors
# the cold-start register flow in main.py:319-323
# and completes the recovery path for workspaces
# whose secret was lazy-healed on the platform
# side after register-time. Without this, the
# workspace stays 401-forever on chat upload
# until restart. See workspace-server PR #2421
# for the server-side delivery change.
_persist_inbound_secret_from_heartbeat(resp)
except Exception as e:
self._consecutive_failures += 1
# Issue #1877: if heartbeat 401'd, re-read the token from disk
@ -202,13 +252,14 @@ class HeartbeatLoop:
"uptime_seconds": int(time.time() - self.start_time),
}
retry_body.update(_runtime_state_payload())
await client.post(
retry_resp = await client.post(
f"{self.platform_url}/registry/heartbeat",
json=retry_body,
headers=auth_headers(),
)
self._consecutive_failures = 0
self.request_count += 1
_persist_inbound_secret_from_heartbeat(retry_resp)
except Exception:
# Retry also failed — fall through to the normal
# failure tracking below.
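
Aside: platform_inbound_auth.save_inbound_secret is imported above but its body is not part of this diff. The only contract the heartbeat code relies on is "persist the string to disk, raising on I/O failure". A minimal sketch of what such a helper could look like follows; the file path and the atomic-rename strategy are illustrative assumptions, not the module's actual implementation.

import os
import tempfile

_SECRET_PATH = "/var/run/molecule/platform_inbound_secret"  # hypothetical location

def save_inbound_secret(secret: str) -> None:
    """Write atomically so a crash mid-write never leaves a truncated
    secret file for the inbound-auth middleware to read."""
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(_SECRET_PATH))
    try:
        with os.fdopen(fd, "w") as f:
            f.write(secret)
        os.replace(tmp, _SECRET_PATH)  # atomic rename on POSIX
    except Exception:
        os.unlink(tmp)
        raise

Atomicity matters here because the middleware may read the file between heartbeats; a plain open(path, "w").write() could expose a half-written secret.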

View File

@ -51,6 +51,16 @@ logger = logging.getLogger(__name__)
# laptop sleep.
HEARTBEAT_INTERVAL_SECONDS = 20.0
# After this many consecutive 401/403 heartbeats, escalate from
# WARNING to ERROR with re-onboard guidance. 3 ticks at 20s = ~1 minute
# of sustained auth failure — enough to rule out a transient platform
# blip but quick enough that an operator doesn't sit puzzled for 10
# minutes wondering why their MCP tools 401. A separate interval
# re-logs the ERROR every 20 ticks (~7 min) so a long-running
# session that missed the first ERROR still sees the message.
_HEARTBEAT_AUTH_LOUD_THRESHOLD = 3
_HEARTBEAT_AUTH_RELOG_INTERVAL = 20
def _platform_register(platform_url: str, workspace_id: str, token: str) -> None:
"""One-shot register at startup; fails fast on auth errors.
@ -145,6 +155,7 @@ def _heartbeat_loop(
return
start_time = time.time()
consecutive_auth_failures = 0
while True:
body = {
"workspace_id": workspace_id,
@ -165,19 +176,69 @@ def _heartbeat_loop(
json=body,
headers=headers,
)
if resp.status_code >= 400:
if resp.status_code in (401, 403):
consecutive_auth_failures += 1
_log_heartbeat_auth_failure(
consecutive_auth_failures, workspace_id, resp.status_code,
)
elif resp.status_code >= 400:
# Non-auth HTTP error — log, but DO NOT touch the
# auth-failure counter (5xx blips, 429, etc. are
# transient and unrelated to token validity).
logger.warning(
"molecule-mcp: heartbeat HTTP %d: %s",
resp.status_code,
(resp.text or "")[:200],
)
else:
consecutive_auth_failures = 0
_persist_inbound_secret_from_heartbeat(resp)
except Exception as exc: # noqa: BLE001
logger.warning("molecule-mcp: heartbeat failed: %s", exc)
time.sleep(interval)
def _log_heartbeat_auth_failure(count: int, workspace_id: str, status_code: int) -> None:
"""Escalate consecutive heartbeat 401/403s from quiet WARNING to
actionable ERROR.
The operator's first sign of trouble shouldn't be "tools 401 with no
explanation" — that was the failure mode that motivated this code,
triggered by a workspace being deleted server-side and its tokens
revoked while the runtime kept heartbeating in silence.
Cadence:
* count < threshold: WARNING per tick (could still be a transient
platform blip, don't shout yet)
* count == threshold: ERROR with re-onboard instructions
(the first signal the operator can't miss)
* count > threshold and (count - threshold) % relog == 0: re-log
ERROR (so a session that started after the first ERROR still
sees the message scrolling past in their logs)
"""
if count < _HEARTBEAT_AUTH_LOUD_THRESHOLD:
logger.warning(
"molecule-mcp: heartbeat HTTP %d (auth failure %d/%d) — "
"token may be revoked. Will retry; if persistent, regenerate "
"from canvas → Tokens.",
status_code, count, _HEARTBEAT_AUTH_LOUD_THRESHOLD,
)
return
# At or past the threshold — this is the loud actionable error.
if count == _HEARTBEAT_AUTH_LOUD_THRESHOLD or (
count - _HEARTBEAT_AUTH_LOUD_THRESHOLD
) % _HEARTBEAT_AUTH_RELOG_INTERVAL == 0:
logger.error(
"molecule-mcp: %d consecutive heartbeat auth failures (HTTP %d) — "
"the token in MOLECULE_WORKSPACE_TOKEN has been REVOKED, likely "
"because workspace %s was deleted server-side. The MCP server is "
"still running but every platform call will fail. Regenerate the "
"workspace + token from the canvas (Tokens tab), update your MCP "
"config, and restart your runtime.",
count, status_code, workspace_id,
)
def _persist_inbound_secret_from_heartbeat(resp: object) -> None:
"""Persist ``platform_inbound_secret`` from a heartbeat response, if any.

View File

@ -355,3 +355,149 @@ def test_on_done_restarts_loop():
mock_create.assert_called_once()
# New task should have done callback
mock_new_task.add_done_callback.assert_called_once()
# ============== In-container heartbeat persists platform_inbound_secret (2026-04-30) ==============
# Pairs with workspace-server PR #2421's heartbeat-delivers-secret change.
# The standalone wrapper (mcp_cli.py) got persistence in #2421; the
# in-container heartbeat (heartbeat.py) was missed and the symptom
# returned: hongmingwang Claude Code agent stayed 401-forever on chat
# upload because the workspace's runtime never picked up the lazy-healed
# secret without a restart.
import heartbeat as heartbeat_mod # noqa: E402
def test_persist_inbound_secret_happy_path(monkeypatch):
"""200 with platform_inbound_secret in body → save_inbound_secret called."""
class FakeResp:
def json(self):
return {"status": "ok", "platform_inbound_secret": "fresh-secret"}
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == ["fresh-secret"]
def test_persist_inbound_secret_skips_when_absent(monkeypatch):
class FakeResp:
def json(self):
return {"status": "ok"}
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_skips_on_empty(monkeypatch):
class FakeResp:
def json(self):
return {"status": "ok", "platform_inbound_secret": ""}
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_swallows_non_json(monkeypatch):
class FakeResp:
def json(self):
raise ValueError("not json")
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
# Must not raise
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_handles_non_dict(monkeypatch):
class FakeResp:
def json(self):
return ["unexpected", "list"]
saved: list[str] = []
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", saved.append)
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
assert saved == []
def test_persist_inbound_secret_swallows_save_oserror(monkeypatch):
class FakeResp:
def json(self):
return {"platform_inbound_secret": "x"}
def boom(_secret):
raise OSError("disk full")
import platform_inbound_auth
monkeypatch.setattr(platform_inbound_auth, "save_inbound_secret", boom)
# Heartbeat liveness > secret persistence — must not raise.
heartbeat_mod._persist_inbound_secret_from_heartbeat(FakeResp())
@pytest.mark.asyncio
async def test_heartbeat_loop_persists_secret_from_response(monkeypatch):
"""End-to-end: in-container _loop persists secret when the heartbeat
response carries platform_inbound_secret."""
saved: list[str] = []
def fake_persist(resp):
try:
body = resp.json()
except Exception:
return
if isinstance(body, dict) and body.get("platform_inbound_secret"):
saved.append(body["platform_inbound_secret"])
monkeypatch.setattr(
heartbeat_mod,
"_persist_inbound_secret_from_heartbeat",
fake_persist,
)
hb = HeartbeatLoop("http://platform:8080", "ws-abc")
mock_response = MagicMock()
mock_response.json = MagicMock(
return_value={"status": "ok", "platform_inbound_secret": "from-heartbeat"}
)
mock_client = AsyncMock()
mock_client.post = AsyncMock(return_value=mock_response)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
with patch("heartbeat.httpx.AsyncClient", return_value=mock_client):
task = asyncio.create_task(hb._loop())
await asyncio.sleep(0.05)
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
assert saved == ["from-heartbeat"], (
"in-container heartbeat must persist platform_inbound_secret from 200 response"
)

View File

@ -703,3 +703,167 @@ def test_heartbeat_loop_skips_persist_on_4xx(monkeypatch):
)
assert saw == [], "4xx response must NOT trigger persist call"
# ============== Heartbeat auth-failure escalation (2026-05-01) ==============
# When a workspace is deleted server-side (DELETE /workspaces/:id), the
# platform revokes the workspace's auth token. The heartbeat starts
# 401-ing. The previous behavior just logged WARNING on every tick — a
# user tailing logs might miss it, and there was no actionable signal
# anywhere. Escalate after a small number of consecutive auth failures
# so the operator gets a clear "token revoked, re-onboard" message and
# isn't left to puzzle out why their MCP tools 401.
#
# Pairs with the register-time 401 hard-fail path that already exists
# at mcp_cli.py:104-111.
def _multi_iter_runner(monkeypatch, response_status_codes):
"""Run _heartbeat_loop for ``len(response_status_codes)`` iterations.
Each call to FakeClient.post returns a response with the next status
code from ``response_status_codes``. After all responses are consumed,
the next sleep raises SystemExit to break the loop.
"""
import types
iterations = {"count": 0}
target = len(response_status_codes)
class FakeResp:
def __init__(self, status_code):
self.status_code = status_code
self.text = "" if status_code < 400 else '{"error":"invalid workspace auth token"}'
def json(self):
if self.status_code >= 400:
return {"error": "invalid workspace auth token"}
return {"status": "ok"}
class FakeClient:
def __init__(self, **_kw): pass
def __enter__(self): return self
def __exit__(self, *_a): return False
def post(self, *_a, **_kw):
i = iterations["count"]
sc = response_status_codes[i] if i < len(response_status_codes) else 200
return FakeResp(sc)
fake_httpx = types.ModuleType("httpx")
fake_httpx.Client = FakeClient
monkeypatch.setitem(sys.modules, "httpx", fake_httpx)
def fake_sleep(_):
iterations["count"] += 1
if iterations["count"] >= target:
raise SystemExit
monkeypatch.setattr("time.sleep", fake_sleep)
with pytest.raises(SystemExit):
mcp_cli._heartbeat_loop(
"https://test.moleculesai.app",
"ws-deleted-12345678",
"stale-token",
interval=20.0,
)
def test_heartbeat_single_401_logs_warning_not_error(monkeypatch, caplog):
"""One 401 alone is not enough to declare the token dead — could be a
transient platform blip. Log at WARNING; don't shout."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [401])
auth_records = [r for r in caplog.records if "401" in r.message
or "auth" in r.message.lower()
or "revoked" in r.message.lower()]
# At least the WARNING-level mention of HTTP 401 must appear.
assert any(r.levelno == logging.WARNING for r in auth_records), (
f"expected at least one WARNING about 401, got: "
f"{[(r.levelname, r.message) for r in auth_records]}"
)
# Crucially, NOT escalated to ERROR yet — only one failure.
assert not any(r.levelno >= logging.ERROR for r in auth_records), (
"single 401 must not escalate to ERROR — premature alarm"
)
def test_heartbeat_three_consecutive_401s_escalates_to_error(monkeypatch, caplog):
"""Token-revoked is the canonical failure mode after a workspace is
deleted server-side. After 3 consecutive 401s the operator gets a
LOUD ERROR with re-onboard guidance, not one buried at WARNING."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [401, 401, 401])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert error_records, (
f"expected ERROR after 3 consecutive 401s, got only: "
f"{[(r.levelname, r.message[:80]) for r in caplog.records]}"
)
# The message must be actionable — operator needs to know what to do.
msg = " ".join(r.message for r in error_records).lower()
assert "revoked" in msg or "deleted" in msg, (
f"ERROR must explain WHY (token revoked / workspace deleted), got: {msg}"
)
assert "regenerate" in msg or "re-onboard" in msg or "tokens" in msg, (
f"ERROR must point at the canvas Tokens tab so operator knows how to recover, got: {msg}"
)
# The workspace_id should appear so the operator knows which one is dead.
assert "ws-deleted" in msg, f"ERROR must name the dead workspace_id, got: {msg}"
def test_heartbeat_403_treated_same_as_401(monkeypatch, caplog):
"""403 Forbidden is the other auth-failure shape (token valid but
not authorized for this workspace). Same escalation path."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [403, 403, 403])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert error_records, "expected ERROR after 3 consecutive 403s"
def test_heartbeat_recovery_resets_consecutive_counter(monkeypatch, caplog):
"""If the platform comes back to 200 in the middle of an outage,
the auth-failure counter must reset. A subsequent isolated 401
later should NOT immediately escalate."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
# Two 401s, then 200, then one 401. If counter resets correctly,
# the final 401 is "1 consecutive" and should NOT escalate.
_multi_iter_runner(monkeypatch, [401, 401, 200, 401])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
assert not error_records, (
f"recovered (200) → reset counter → final isolated 401 must NOT "
f"escalate. Got ERRORs: {[r.message[:80] for r in error_records]}"
)
def test_heartbeat_500_does_not_increment_auth_counter(monkeypatch, caplog):
"""5xx is a server-side blip, not auth. Three consecutive 500s
must NOT trigger the 'token revoked' escalation, which would
mislead the operator."""
import logging
caplog.set_level(logging.WARNING, logger="mcp_cli")
_multi_iter_runner(monkeypatch, [500, 500, 500])
error_records = [r for r in caplog.records if r.levelno >= logging.ERROR]
revoked_errors = [r for r in error_records if "revoked" in r.message.lower()]
assert not revoked_errors, (
f"5xx must NOT be classified as auth failure — would mislead operator. "
f"Got 'revoked' ERRORs: {[r.message[:80] for r in revoked_errors]}"
)