forked from molecule-ai/molecule-core
Wheel-build smoke gate detected `configs_dir` missing from scripts/build_runtime_package.py:TOP_LEVEL_MODULES. Without it the build would ship `import configs_dir` un-rewritten and every external-runtime install would die on `ModuleNotFoundError` at first import. Two callers used `import configs_dir as _configs_dir` to belt-and- suspenders against an imagined name collision, but the rewriter rejects `import X as Y` because the rewrite would produce `import molecule_runtime.X as X as Y` (invalid syntax). No actual collision exists (only docstring/comment references). Switched to plain `import configs_dir`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
487 lines
19 KiB
Python
487 lines
19 KiB
Python
"""Console-script entry point for the ``molecule-mcp`` universal MCP server.
|
|
|
|
Validates required environment BEFORE importing the heavy
|
|
``a2a_mcp_server`` module — that module triggers a ``RuntimeError`` at
|
|
import time when ``WORKSPACE_ID`` is unset (a2a_client.py:22), and
|
|
console-script entry-point shims surface it as an ugly traceback. This
|
|
wrapper catches the missing-env case early and prints actionable help
|
|
to stderr so an operator running ``molecule-mcp`` for the first time
|
|
gets the right pointer in the first 3 lines of output instead of a
|
|
20-line traceback.
|
|
|
|
Standalone-runtime contract: this wrapper is responsible for keeping
|
|
the workspace ALIVE on the platform side, not just exposing tools.
|
|
Concretely it:
|
|
1. Calls ``POST /registry/register`` once at startup (idempotent —
|
|
the upsert flips status awaiting_agent → online for an external
|
|
workspace whose token matches).
|
|
2. Spawns a daemon heartbeat thread that POSTs to
|
|
``POST /registry/heartbeat`` every 20s. Without continuous
|
|
heartbeats the platform's healthsweep flips the workspace back
|
|
to awaiting_agent (visible as OFFLINE in the canvas with a
|
|
"Restart" CTA) within 60-90s.
|
|
3. Runs the MCP stdio loop in the foreground.
|
|
|
|
Why threads + sync requests: the MCP stdio server is async. The
heartbeat work is fire-and-forget HTTP. A daemon thread is the
lowest-friction integration — no asyncio bridging, dies automatically
when the main process exits, and ``httpx`` is already a transitive
dependency via ``a2a-sdk``.
|
|
|
|
In-container usage (``python -m molecule_runtime.a2a_mcp_server`` or
|
|
direct import) bypasses this wrapper — the workspace runtime has its
|
|
own heartbeat loop in ``heartbeat.py`` so we don't double-heartbeat.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import configs_dir
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Heartbeat cadence. Must be tighter than healthsweep's stale window
|
|
# (currently 60-90s — see registry/healthsweep.go) by a comfortable
|
|
# margin so a single missed heartbeat doesn't flip awaiting_agent.
|
|
# 20s gives the operator's network 3 attempts within the budget; long
|
|
# enough that it doesn't spam, short enough to recover quickly after
|
|
# laptop sleep.
|
|
HEARTBEAT_INTERVAL_SECONDS = 20.0
|
|
|
|
# After this many consecutive 401/403 heartbeats, escalate from
|
|
# WARNING to ERROR with re-onboard guidance. 3 ticks at 20s = ~1 minute
|
|
# of sustained auth failure — enough to rule out a transient platform
|
|
# blip but quick enough that an operator doesn't sit puzzled for 10
|
|
# minutes wondering why their MCP tools 401. Same threshold used for
|
|
# repeat-logging at 20-tick (~7 min) intervals so a long-running
|
|
# session that missed the first ERROR still sees the message.
|
|
_HEARTBEAT_AUTH_LOUD_THRESHOLD = 3
|
|
_HEARTBEAT_AUTH_RELOG_INTERVAL = 20
|
|
|
|
|
|
def _build_agent_card(workspace_id: str) -> dict:
|
|
"""Build the ``agent_card`` payload sent to /registry/register.
|
|
|
|
Three optional env vars override the defaults so an operator can
|
|
surface human-readable identity + capabilities to peers and the
|
|
canvas Skills tab without code changes:
|
|
|
|
* ``MOLECULE_AGENT_NAME`` — display name (defaults to
|
|
``molecule-mcp-{id[:8]}``). Surfaced in canvas workspace cards
|
|
and ``list_peers`` output.
|
|
* ``MOLECULE_AGENT_DESCRIPTION`` — one-liner about the agent's
|
|
purpose. Rendered in canvas Details + Skills tabs.
|
|
* ``MOLECULE_AGENT_SKILLS`` — comma-separated skill names
|
|
(e.g. ``research,code-review,memory-curation``). Each name is
|
|
expanded to a ``{"name": ...}`` skill object — the minimum
|
|
shape that satisfies both ``shared_runtime.summarize_peers``
|
|
(uses ``s["name"]``) and the canvas SkillsTab.tsx schema
|
|
(id falls back to name when omitted). Empty / whitespace
|
|
entries are dropped.
|
|
|
|
Defaults match the previous hardcoded behaviour exactly so this
|
|
is a strict superset — an operator who sets none of the env vars
|
|
sees no change.
|
|
"""
|
|
name = (os.environ.get("MOLECULE_AGENT_NAME") or "").strip()
|
|
if not name:
|
|
name = f"molecule-mcp-{workspace_id[:8]}"
|
|
|
|
description = (os.environ.get("MOLECULE_AGENT_DESCRIPTION") or "").strip()
|
|
|
|
skills_raw = (os.environ.get("MOLECULE_AGENT_SKILLS") or "").strip()
|
|
skills: list[dict] = []
|
|
if skills_raw:
|
|
for s in skills_raw.split(","):
|
|
label = s.strip()
|
|
if label:
|
|
skills.append({"name": label})
|
|
|
|
card: dict = {"name": name, "skills": skills}
|
|
if description:
|
|
card["description"] = description
|
|
return card
|
|
|
|
|
|
def _platform_register(platform_url: str, workspace_id: str, token: str) -> None:
    """POST once to ``{platform_url}/registry/register`` at startup.

    The request carries the workspace id, an empty ``url``, the agent
    card from ``_build_agent_card`` and ``delivery_mode="poll"``, with a
    bearer ``Authorization`` header and ``Origin`` set to the platform
    URL (the same header set the heartbeat uses).

    Outcomes:
      * HTTP 401 / 403 — print a token-regeneration hint to stderr and
        terminate the process via ``sys.exit(3)``.
      * Any other status >= 400 — log a WARNING with the first 200
        characters of the response body, then return.
      * Status < 400 — log at INFO, then return.
      * Any ``Exception`` raised while sending or inspecting the
        response — log a WARNING, then return.
      * ``httpx`` not importable — return silently.

    Args:
        platform_url: Platform base URL (no trailing slash expected).
        workspace_id: Workspace identifier to register.
        token: Bearer token for the workspace.

    Raises:
        SystemExit: With code 3 when the platform answers 401 or 403.
    """
    try:
        import httpx
    except ImportError:
        # Nothing to do here without the HTTP client; the later import
        # of the MCP server module is left to surface the real problem.
        return

    request_headers = {
        "Authorization": f"Bearer {token}",
        "Origin": platform_url,
        "Content-Type": "application/json",
    }
    registration = {
        "id": workspace_id,
        "url": "",
        "agent_card": _build_agent_card(workspace_id),
        "delivery_mode": "poll",
    }
    endpoint = f"{platform_url}/registry/register"

    # SystemExit derives from BaseException, so the sys.exit(3) below is
    # not intercepted by the ``except Exception`` clause.
    try:
        with httpx.Client(timeout=10.0) as client:
            resp = client.post(endpoint, json=registration, headers=request_headers)
            if resp.status_code in (401, 403):
                rejection = (
                    f"molecule-mcp: register rejected with HTTP {resp.status_code} — "
                    f"the token in MOLECULE_WORKSPACE_TOKEN is invalid for workspace "
                    f"{workspace_id}. Regenerate from the canvas → Tokens tab."
                )
                print(rejection, file=sys.stderr)
                sys.exit(3)
            if resp.status_code < 400:
                logger.info(
                    "molecule-mcp: registered workspace %s with platform",
                    workspace_id,
                )
            else:
                logger.warning(
                    "molecule-mcp: register POST returned HTTP %d: %s",
                    resp.status_code,
                    (resp.text or "")[:200],
                )
    except Exception as exc:  # noqa: BLE001
        logger.warning("molecule-mcp: register POST failed: %s", exc)
|
|
|
|
|
|
def _heartbeat_loop(
    platform_url: str,
    workspace_id: str,
    token: str,
    interval: float = HEARTBEAT_INTERVAL_SECONDS,
) -> None:
    """Thread body: POST ``/registry/heartbeat`` forever, every ``interval`` s.

    Each tick sends the workspace id, zeroed error/task metrics and the
    seconds elapsed since this loop started. The response is handled as:

      * 401 / 403 — bump the consecutive-auth-failure counter and
        delegate logging to ``_log_heartbeat_auth_failure``.
      * other status >= 400 — log a WARNING; the auth counter is left
        untouched.
      * status < 400 — reset the auth counter and hand the response to
        ``_persist_inbound_secret_from_heartbeat``.
      * any ``Exception`` — log a WARNING and keep looping.

    The ``token`` argument is captured once; this loop does not re-read
    it from the environment between ticks.

    The function only returns if ``httpx`` cannot be imported; otherwise
    it loops until the process ends.

    Args:
        platform_url: Platform base URL.
        workspace_id: Workspace identifier reported in each heartbeat.
        token: Bearer token sent in the ``Authorization`` header.
        interval: Sleep, in seconds, after each attempt.
    """
    try:
        import httpx
    except ImportError:
        return

    # Neither the endpoint nor the headers depend on per-tick state.
    endpoint = f"{platform_url}/registry/heartbeat"
    request_headers = {
        "Authorization": f"Bearer {token}",
        "Origin": platform_url,
        "Content-Type": "application/json",
    }

    started_at = time.time()
    auth_failures_in_a_row = 0
    while True:
        beat = {
            "workspace_id": workspace_id,
            "error_rate": 0.0,
            "sample_error": "",
            "active_tasks": 0,
            "uptime_seconds": int(time.time() - started_at),
        }
        try:
            with httpx.Client(timeout=10.0) as client:
                resp = client.post(endpoint, json=beat, headers=request_headers)
                if resp.status_code in (401, 403):
                    auth_failures_in_a_row += 1
                    _log_heartbeat_auth_failure(
                        auth_failures_in_a_row, workspace_id, resp.status_code,
                    )
                elif resp.status_code < 400:
                    auth_failures_in_a_row = 0
                    _persist_inbound_secret_from_heartbeat(resp)
                else:
                    # Non-auth HTTP failure: report it but deliberately
                    # leave the auth-failure streak as it is.
                    logger.warning(
                        "molecule-mcp: heartbeat HTTP %d: %s",
                        resp.status_code,
                        (resp.text or "")[:200],
                    )
        except Exception as exc:  # noqa: BLE001
            logger.warning("molecule-mcp: heartbeat failed: %s", exc)
        time.sleep(interval)
|
|
|
|
|
|
def _log_heartbeat_auth_failure(count: int, workspace_id: str, status_code: int) -> None:
    """Log a heartbeat 401/403, getting louder as the streak grows.

    Cadence, relative to ``_HEARTBEAT_AUTH_LOUD_THRESHOLD``:

      * ``count`` below the threshold — one WARNING per call.
      * ``count`` equal to the threshold — one ERROR with re-onboard
        instructions.
      * ``count`` above the threshold — the same ERROR again whenever
        the distance past the threshold is a multiple of
        ``_HEARTBEAT_AUTH_RELOG_INTERVAL``; otherwise nothing is logged.

    Args:
        count: Number of consecutive auth failures so far (1-based).
        workspace_id: Workspace named in the ERROR message.
        status_code: The HTTP status that was received.
    """
    past_threshold = count - _HEARTBEAT_AUTH_LOUD_THRESHOLD

    if past_threshold < 0:
        # Still possibly transient: keep it at WARNING.
        logger.warning(
            "molecule-mcp: heartbeat HTTP %d (auth failure %d/%d) — "
            "token may be revoked. Will retry; if persistent, regenerate "
            "from canvas → Tokens.",
            status_code, count, _HEARTBEAT_AUTH_LOUD_THRESHOLD,
        )
        return

    # Exactly at the threshold, or on a re-log boundary after it.
    is_loud_tick = (
        past_threshold == 0
        or past_threshold % _HEARTBEAT_AUTH_RELOG_INTERVAL == 0
    )
    if not is_loud_tick:
        return

    logger.error(
        "molecule-mcp: %d consecutive heartbeat auth failures (HTTP %d) — "
        "the token in MOLECULE_WORKSPACE_TOKEN has been REVOKED, likely "
        "because workspace %s was deleted server-side. The MCP server is "
        "still running but every platform call will fail. Regenerate the "
        "workspace + token from the canvas (Tokens tab), update your MCP "
        "config, and restart your runtime.",
        count, status_code, workspace_id,
    )
|
|
|
|
|
|
def _persist_inbound_secret_from_heartbeat(resp: object) -> None:
|
|
"""Persist ``platform_inbound_secret`` from a heartbeat response, if any.
|
|
|
|
The platform's heartbeat handler returns the secret on every beat
|
|
(mirroring /registry/register) so a workspace that lazy-healed the
|
|
secret on the platform side — typical recovery path for a workspace
|
|
whose row had a NULL ``platform_inbound_secret`` after a partial
|
|
bootstrap — picks it up within one heartbeat tick instead of
|
|
requiring a runtime restart.
|
|
|
|
Without this delivery path the chat-upload code path's "secret was
|
|
just minted, will pick up on next heartbeat" 503 message is a lie
|
|
and the workspace stays 401-forever until the operator restarts
|
|
the runtime. Caught 2026-04-30 on hongmingwang tenant.
|
|
|
|
Failure is non-fatal: if the body isn't JSON, doesn't carry the
|
|
field, or the disk write fails, the next heartbeat retries. This
|
|
matches the cold-start register flow in main.py:319-323.
|
|
"""
|
|
try:
|
|
body = resp.json()
|
|
except Exception: # noqa: BLE001
|
|
return
|
|
if not isinstance(body, dict):
|
|
return
|
|
secret = body.get("platform_inbound_secret")
|
|
if not secret:
|
|
return
|
|
try:
|
|
from platform_inbound_auth import save_inbound_secret
|
|
|
|
save_inbound_secret(secret)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning(
|
|
"molecule-mcp: persist inbound secret from heartbeat failed: %s", exc
|
|
)
|
|
|
|
|
|
def _start_heartbeat_thread(
    platform_url: str,
    workspace_id: str,
    token: str,
) -> threading.Thread:
    """Launch ``_heartbeat_loop`` on a background daemon thread.

    The thread is named ``molecule-mcp-heartbeat`` and is created with
    ``daemon=True``, so it does not keep the interpreter alive once the
    foreground MCP loop finishes.

    Args:
        platform_url: Platform base URL, forwarded to the loop.
        workspace_id: Workspace identifier, forwarded to the loop.
        token: Bearer token, forwarded to the loop.

    Returns:
        The already-started ``threading.Thread``.
    """
    loop_args = (platform_url, workspace_id, token)
    heartbeat = threading.Thread(
        name="molecule-mcp-heartbeat",
        target=_heartbeat_loop,
        args=loop_args,
        daemon=True,
    )
    heartbeat.start()
    return heartbeat
|
|
|
|
|
|
def _print_missing_env_help(missing: list[str], have_token_file: bool) -> None:
|
|
print("molecule-mcp: missing required environment.\n", file=sys.stderr)
|
|
print("Set the following before running molecule-mcp:", file=sys.stderr)
|
|
print(" WORKSPACE_ID — your workspace UUID (from canvas)", file=sys.stderr)
|
|
print(
|
|
" PLATFORM_URL — base URL of your Molecule platform "
|
|
"(e.g. https://your-tenant.staging.moleculesai.app)",
|
|
file=sys.stderr,
|
|
)
|
|
if not have_token_file:
|
|
print(
|
|
" MOLECULE_WORKSPACE_TOKEN — bearer token for this workspace "
|
|
"(canvas → Tokens tab)",
|
|
file=sys.stderr,
|
|
)
|
|
print("", file=sys.stderr)
|
|
print(f"Currently missing: {', '.join(missing)}", file=sys.stderr)
|
|
|
|
|
|
def main() -> None:
    """Entry point for the ``molecule-mcp`` console script.

    Steps, in order:

    1. Validate configuration: ``WORKSPACE_ID`` and ``PLATFORM_URL`` must
       be non-blank, and a non-empty token must be obtainable from
       ``MOLECULE_WORKSPACE_TOKEN`` or from the ``.auth_token`` file in
       the resolved configs dir. On failure, help is printed to stderr
       and the process exits with status 2.
    2. Configure logging (INFO when ``MOLECULE_MCP_VERBOSE`` is set,
       WARNING otherwise).
    3. Unless ``MOLECULE_MCP_DISABLE_HEARTBEAT`` is set, register with
       the platform and start the heartbeat thread.
    4. Unless ``MOLECULE_MCP_DISABLE_INBOX`` is set, start the inbox
       poller.
    5. Import ``a2a_mcp_server`` and run its ``cli_main`` in the
       foreground.

    Raises:
        SystemExit: Code 2 when required configuration is missing;
            ``_platform_register`` may also exit with code 3 when the
            platform rejects the token.
    """
    missing: list[str] = []
    if not os.environ.get("WORKSPACE_ID", "").strip():
        missing.append("WORKSPACE_ID")
    if not os.environ.get("PLATFORM_URL", "").strip():
        missing.append("PLATFORM_URL")

    # Resolve the effective token up front: the env var wins (operator
    # override), then the on-disk ``.auth_token``. ``_read_token_file``
    # goes through ``configs_dir.resolve()``, so the in-container vs
    # external-runtime location is handled there.
    #
    # Validation is done on the token's CONTENT rather than on the mere
    # existence of the file: an empty, whitespace-only or unreadable
    # ``.auth_token`` yields "" and must count as "no token", otherwise
    # an empty bearer token would be sent to the platform.
    #
    # NOTE(review): earlier comments here disagreed on whether
    # platform_auth.get_token is file-first or env-first. This wrapper
    # is env-first — confirm against platform_auth.
    env_token = os.environ.get("MOLECULE_WORKSPACE_TOKEN", "").strip()
    file_token = _read_token_file()
    token = env_token or file_token
    if not token:
        missing.append("MOLECULE_WORKSPACE_TOKEN (or CONFIGS_DIR/.auth_token)")

    if missing:
        # The help omits the token line only when the file actually
        # supplied a usable token.
        _print_missing_env_help(missing, have_token_file=bool(file_token))
        sys.exit(2)

    workspace_id = os.environ["WORKSPACE_ID"].strip()
    platform_url = os.environ["PLATFORM_URL"].strip().rstrip("/")

    # Configure logging so the operator sees register/heartbeat status
    # without setting up logging themselves. WARNING keeps steady state
    # quiet (failures only); MOLECULE_MCP_VERBOSE surfaces INFO-level
    # messages such as register success.
    log_level = (
        logging.INFO
        if os.environ.get("MOLECULE_MCP_VERBOSE", "").strip()
        else logging.WARNING
    )
    logging.basicConfig(level=log_level, format="[molecule-mcp] %(message)s")

    # Standalone-mode register + heartbeat. MOLECULE_MCP_DISABLE_HEARTBEAT
    # is the escape hatch for callers that already run their own
    # heartbeat loop (and for tests), so they can reuse this entry point
    # without double-heartbeating.
    if not os.environ.get("MOLECULE_MCP_DISABLE_HEARTBEAT", "").strip():
        _platform_register(platform_url, workspace_id, token)
        _start_heartbeat_thread(platform_url, workspace_id, token)

    # Inbox poller — the inbound side of the standalone path. Without
    # it the server is outbound-only: the agent could send messages but
    # never observe incoming ones. MOLECULE_MCP_DISABLE_INBOX follows
    # the same disable pattern as the heartbeat switch.
    if not os.environ.get("MOLECULE_MCP_DISABLE_INBOX", "").strip():
        _start_inbox_poller(platform_url, workspace_id)

    # Env is valid — only now import the heavy module. Per the module
    # notes, importing it earlier raises at import time when
    # WORKSPACE_ID is unset, before the friendly help could be shown.
    from a2a_mcp_server import cli_main

    cli_main()
|
|
|
|
|
|
def _start_inbox_poller(platform_url: str, workspace_id: str) -> None:
    """Create the inbox state, activate it, and start its poller thread.

    The ``inbox`` module is imported here rather than at module load so
    that it is only pulled in after ``main`` has validated the
    environment. If the import fails, a WARNING is logged and no poller
    is started.

    Args:
        platform_url: Platform base URL passed to the poller.
        workspace_id: Workspace identifier passed to the poller.
    """
    try:
        import inbox
    except ImportError as exc:
        logger.warning("molecule-mcp: inbox module unavailable: %s", exc)
        return

    cursor_file = inbox.default_cursor_path()
    inbox_state = inbox.InboxState(cursor_path=cursor_file)
    inbox.activate(inbox_state)
    inbox.start_poller_thread(inbox_state, platform_url, workspace_id)
|
|
|
|
|
|
def _read_token_file() -> str:
    """Return the stripped contents of ``.auth_token`` in the configs dir.

    The directory comes from ``configs_dir.resolve()``. The path is
    computed inline so this lightweight wrapper does not need to import
    the heavier auth module just to locate the file.

    Returns:
        The token text with surrounding whitespace removed, or ``""``
        when the path is not a regular file or reading it raises
        ``OSError``.
    """
    token_path = configs_dir.resolve() / ".auth_token"
    contents = ""
    if token_path.is_file():
        try:
            contents = token_path.read_text().strip()
        except OSError:
            # Unreadable file is treated the same as an absent one.
            contents = ""
    return contents
|
|
|
|
|
|
# Allow running this module directly as a script; the console script
# calls main() itself.
if __name__ == "__main__":  # pragma: no cover
    main()
|