fix(mcp-doctor): heartbeat (idempotent) instead of register (UPSERT)

Self-review caught after #2954 landed: check_register() POSTed to
/registry/register with agent_card.name="doctor-probe". The endpoint
is an UPSERT, so the doctor probe overwrites the workspace's actual
agent_card metadata until the real agent's next register call. An
operator running `molecule-mcp doctor` against a live workspace
would see their canvas briefly display "doctor-probe" as the agent
name — invisible production-disruption.

Switches to POST /registry/heartbeat. heartbeat only updates
last_heartbeat_at (and clears awaiting_agent if needed) — the same
work a normal molecule-mcp boot does every 20s in steady state, so
the doctor's extra heartbeat is indistinguishable from background
traffic.

Function renamed check_register → check_token_auth to match what
it actually does. check_register kept as back-compat alias so any
external test/import still resolves.

Also unified the duplicated token-resolution paths into a single
_resolve_token() returning (value, source_label). Pre-fix:
check_register and _resolve_token_summary read env in parallel
ladders — a future env-var addition would have to touch both.

New tests:
  - test_check_token_auth_uses_heartbeat_endpoint: mocks urlopen,
    asserts the URL ends in /registry/heartbeat AND does NOT
    contain /registry/register. Pins the load-bearing invariant
    so a future refactor can't silently re-route through register.
  - test_resolve_token_returns_value_and_label_for_env: pins the
    consolidated resolver returns both pieces of info from the
    same source-decision.
  - test_resolve_token_returns_none_when_missing: missing-env
    happy path.

Verification:
  - 13/13 tests pass (10 existing + 3 new)
  - Manual stripped-env run still renders 4 FAIL + 2 WARN with
    actionable hints, exit 1.

Refs molecule-core#2934 item 6 (doctor side-effect fix-up).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-05-05 16:11:08 -07:00
parent 7ee696ec9a
commit 2652ea8342
2 changed files with 143 additions and 44 deletions

View File

@ -27,9 +27,14 @@ Six checks, in operator-encounter order:
Catches DNS/firewall/wrong-scheme issues
before the operator hits the real
register call.
6. Registry register POST ${PLATFORM_URL}/registry/register
6. Token auth POST ${PLATFORM_URL}/registry/heartbeat
with the resolved workspace_id+token
returns 2xx. End-to-end auth verification.
Uses heartbeat (idempotent timestamp
update) instead of register (UPSERT
would clobber agent_card metadata) so
the doctor is safe to run against a
live workspace.
Each check prints one of:
[OK] <one-line status>
@ -170,18 +175,31 @@ def check_path_for_binary() -> Verdict:
return "fail"
def _resolve_token_summary() -> Optional[str]:
"""Return a short non-secret description of how the token was
sourced (e.g. "env MOLECULE_WORKSPACE_TOKEN", "file .auth_token"),
or None if no token is reachable.
def _resolve_token() -> tuple[Optional[str], Optional[str]]:
"""Return ``(token_value, source_label)`` if the operator's
environment exposes a token, else ``(None, None)``.
Single source of truth used by both ``check_env_vars()`` (which
only needs the source label) and ``check_register()`` (which
needs the actual value to send a Bearer header). Keeping these
in one place means a future env-var addition only updates the
resolver not two parallel readers that can drift.
"""
if os.environ.get("MOLECULE_WORKSPACE_TOKEN", "").strip():
return "env MOLECULE_WORKSPACE_TOKEN"
val = os.environ.get("MOLECULE_WORKSPACE_TOKEN", "").strip()
if val:
return val, "env MOLECULE_WORKSPACE_TOKEN"
file_var = os.environ.get("MOLECULE_WORKSPACE_TOKEN_FILE", "").strip()
if file_var:
if os.path.isfile(file_var):
return f"file {file_var} (via MOLECULE_WORKSPACE_TOKEN_FILE)"
return None
try:
from pathlib import Path as _Path
return (
_Path(file_var).read_text().strip(),
f"file {file_var} (via MOLECULE_WORKSPACE_TOKEN_FILE)",
)
except OSError:
return None, None
return None, None
# Per-runtime container path used by the in-platform path; rarely
# set on external setups but check anyway so the message is
# accurate for both shapes.
@ -189,10 +207,22 @@ def _resolve_token_summary() -> Optional[str]:
import configs_dir
candidate = configs_dir.resolve() / ".auth_token"
if candidate.is_file():
return f"file {candidate}"
try:
return candidate.read_text().strip(), f"file {candidate}"
except OSError:
return None, None
except Exception:
pass
return None
return None, None
def _resolve_token_summary() -> Optional[str]:
"""Return just the source label (no secret value). Convenience
wrapper around :func:`_resolve_token` for callers that don't
need the value itself.
"""
_, label = _resolve_token()
return label
def check_env_vars() -> Verdict:
@ -281,42 +311,35 @@ def check_platform_health() -> Verdict:
return "ok"
def check_register() -> Verdict:
"""Light end-to-end auth check via POST /registry/register.
def check_token_auth() -> Verdict:
"""Light auth check via POST /registry/heartbeat.
Doesn't write a heartbeat or change platform state beyond what
a normal molecule-mcp boot would do the register endpoint is
idempotent. Skipped when env vars failed earlier so the operator
isn't shown a redundant 401.
Why heartbeat and not register: register is an UPSERT sending
it from doctor would clobber the workspace's actual agent_card
(name, description, version) until the real agent next calls
register. That's an invisible production-disruption: someone
runs ``molecule-mcp doctor`` against a live workspace and the
canvas briefly displays "doctor-probe" as the agent name.
Heartbeat only updates last_heartbeat_at (and clears
awaiting_agent if needed) that's exactly what a normal
molecule-mcp boot does every 20s, so an extra heartbeat from
the doctor is indistinguishable from background traffic.
Skipped when env vars failed earlier so the operator isn't shown
a redundant 401.
"""
label = "Registry register"
label = "Token auth"
base = os.environ.get("PLATFORM_URL", "").strip().rstrip("/")
workspace_id = os.environ.get("WORKSPACE_ID", "").strip()
token_summary = _resolve_token_summary()
if not (base and workspace_id and token_summary):
token, source_label = _resolve_token()
if not (base and workspace_id and token):
_warn(label, "skipped (Env vars must pass first)", "fix Env vars, re-run")
return "warn"
# Get the token value matching the resolver's source preference.
token = os.environ.get("MOLECULE_WORKSPACE_TOKEN", "").strip()
if not token:
file_var = os.environ.get("MOLECULE_WORKSPACE_TOKEN_FILE", "").strip()
if file_var and os.path.isfile(file_var):
try:
token = open(file_var).read().strip()
except Exception as e:
_fail(label, f"could not read token file: {e}", "fix the file permissions or path")
return "fail"
if not token:
_warn(label, "skipped (no token resolvable)", "set MOLECULE_WORKSPACE_TOKEN or MOLECULE_WORKSPACE_TOKEN_FILE")
return "warn"
import json
body = json.dumps({
"id": workspace_id,
"url": "", # external URL is unknown to doctor; register accepts empty
"agent_card": {"name": "doctor-probe", "version": "0.0.0"},
}).encode()
body = json.dumps({"id": workspace_id}).encode()
req = urllib_request.Request(
f"{base}/registry/register",
f"{base}/registry/heartbeat",
data=body,
method="POST",
headers={
@ -333,10 +356,10 @@ def check_register() -> Verdict:
status = getattr(e, "code", None)
err = str(e.reason if hasattr(e, "reason") else e)
if status is None:
_fail(label, f"POST {base}/registry/register failed: {err}", "check network")
_fail(label, f"POST {base}/registry/heartbeat failed: {err}", "check network")
return "fail"
except Exception as e:
_fail(label, f"POST register failed: {e}", "check network")
_fail(label, f"POST heartbeat failed: {e}", "check network")
return "fail"
if status == 401:
_fail(
@ -355,19 +378,26 @@ def check_register() -> Verdict:
)
return "fail"
if not (200 <= status < 300):
_fail(label, f"POST register returned HTTP {status}", "see platform logs")
_fail(label, f"POST heartbeat returned HTTP {status}", "see platform logs")
return "fail"
_ok(label, f"POST {base}/registry/register → {status}")
_ok(label, f"POST {base}/registry/heartbeat → {status} (token from {source_label})")
return "ok"
# Back-compat alias: the previous name was check_register, but the
# implementation switched to a non-mutating heartbeat probe (see
# check_token_auth's docstring). Kept so external test suites or
# pinned-import scripts don't break on the rename.
check_register = check_token_auth
CHECKS = [
check_python_version,
check_wheel_install,
check_path_for_binary,
check_env_vars,
check_platform_health,
check_register,
check_token_auth,
]

View File

@ -105,6 +105,75 @@ def test_check_register_skipped_without_env(monkeypatch):
assert mcp_doctor.check_register() == "warn"
def test_check_token_auth_uses_heartbeat_endpoint(monkeypatch):
"""Pin: doctor MUST hit /registry/heartbeat, not /registry/register.
register is an UPSERT using it from doctor would clobber the
workspace's actual agent_card metadata until the real agent next
calls register. heartbeat only updates last_heartbeat_at, which
a normal molecule-mcp boot does every 20s anyway, so the doctor's
extra heartbeat is indistinguishable from background traffic.
This test pins the URL via a urllib mock so a future refactor
that accidentally re-routes through /registry/register fails
here at PR-review time, not after operators report
"doctor-probe" briefly appearing as their agent name in canvas.
"""
monkeypatch.setenv("PLATFORM_URL", "https://x.moleculesai.app")
monkeypatch.setenv("WORKSPACE_ID", "ws-test")
monkeypatch.setenv("MOLECULE_WORKSPACE_TOKEN", "tok-abc")
monkeypatch.delenv("MOLECULE_WORKSPACE_TOKEN_FILE", raising=False)
captured: dict[str, object] = {}
class _FakeResp:
status = 200
def __enter__(self): return self
def __exit__(self, *a): pass
def fake_urlopen(req, timeout=None):
captured["full_url"] = req.full_url
captured["method"] = req.get_method()
return _FakeResp()
monkeypatch.setattr(mcp_doctor.urllib_request, "urlopen", fake_urlopen)
verdict = mcp_doctor.check_token_auth()
assert verdict == "ok"
assert captured["method"] == "POST"
# The load-bearing assertion — must use heartbeat, never register.
assert captured["full_url"].endswith("/registry/heartbeat"), (
f"doctor must use /registry/heartbeat (idempotent), not register "
f"(UPSERT — clobbers agent_card). Got: {captured['full_url']}"
)
assert "/registry/register" not in str(captured["full_url"]), (
"doctor must NEVER POST to /registry/register — that's a UPSERT "
"that overwrites agent_card metadata until the real agent next "
"calls register."
)
def test_resolve_token_returns_value_and_label_for_env(monkeypatch):
"""The single resolver returns both the value (for Bearer header)
and a non-secret label (for the env-vars summary). Drift between
label and value is the previous bug shape."""
monkeypatch.setenv("PLATFORM_URL", "https://x.moleculesai.app")
monkeypatch.setenv("MOLECULE_WORKSPACE_TOKEN", "secret-tok-abc")
monkeypatch.delenv("MOLECULE_WORKSPACE_TOKEN_FILE", raising=False)
val, label = mcp_doctor._resolve_token()
assert val == "secret-tok-abc"
assert label == "env MOLECULE_WORKSPACE_TOKEN"
# Summary helper must agree with the resolver's source.
assert mcp_doctor._resolve_token_summary() == label
def test_resolve_token_returns_none_when_missing(monkeypatch):
monkeypatch.delenv("MOLECULE_WORKSPACE_TOKEN", raising=False)
monkeypatch.delenv("MOLECULE_WORKSPACE_TOKEN_FILE", raising=False)
val, label = mcp_doctor._resolve_token()
assert val is None
assert label is None
def test_run_returns_1_when_any_fail(monkeypatch, capsys):
"""End-to-end: stripped environment → at least one FAIL →
exit 1. Pin the exit-code contract so this is scriptable from