forked from molecule-ai/molecule-core
Follow-up A to PR #2449 — that PR taught the platform to return 410 Gone for status='removed' workspaces; this PR teaches get_workspace_info to consume that signal. Before: every non-200 collapsed into {"error": "not found"}, which made the 2026-04-30 incident impossible to diagnose — the operator KNEW the workspace_id existed (they'd just registered it), but the runtime kept reporting "not found" for a deleted-but-not-purged row. After: 410 produces a distinct {"error": "removed", "id", "removed_at", "hint"} dict so callers (heartbeat-loop, channel bridge, dashboard tools) can surface "your workspace was deleted, re-onboard" instead of "not found". Falls back to a default hint if the platform body isn't parseable so the actionable signal doesn't depend on body shape parity. Two new tests: - TestGetWorkspaceInfo.test_410_returns_removed_with_hint - TestGetWorkspaceInfo.test_410_with_unparseable_body_falls_back_to_default_hint Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
383 lines
17 KiB
Python
383 lines
17 KiB
Python
"""A2A protocol client — peer discovery, messaging, and workspace info.
|
|
|
|
Shared constants (WORKSPACE_ID, PLATFORM_URL) live here so that
|
|
a2a_tools and a2a_mcp_server can import them from a single place.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import random
|
|
import re
|
|
import time
|
|
import uuid
|
|
|
|
import httpx
|
|
|
|
from platform_auth import auth_headers, self_source_headers
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_WORKSPACE_ID_raw = os.environ.get("WORKSPACE_ID")
|
|
if not _WORKSPACE_ID_raw:
|
|
raise RuntimeError("WORKSPACE_ID environment variable is required but not set")
|
|
WORKSPACE_ID = _WORKSPACE_ID_raw
|
|
if os.path.exists("/.dockerenv") or os.environ.get("DOCKER_VERSION"):
|
|
PLATFORM_URL = os.environ.get("PLATFORM_URL", "http://host.docker.internal:8080")
|
|
else:
|
|
PLATFORM_URL = os.environ.get("PLATFORM_URL", "http://localhost:8080")
|
|
|
|
# Cache workspace ID → name mappings (populated by list_peers calls)
|
|
_peer_names: dict[str, str] = {}
|
|
|
|
# Sentinel prefix for errors originating from send_a2a_message / child agents.
|
|
# Used by delegate_task to distinguish real errors from normal response text.
|
|
_A2A_ERROR_PREFIX = "[A2A_ERROR] "
|
|
|
|
# Workspace IDs are UUIDs everywhere we generate them (platform's
|
|
# workspaces.id column, /registry/discover/:id route param, etc.) but
|
|
# the agent-facing tool surface receives them as free-form strings via
|
|
# tool args. ``_validate_peer_id`` enforces UUID-shape at the
|
|
# trust boundary so we never interpolate `..` or `/` into a URL path,
|
|
# never silently coerce malformed input into a 404, and surface a
|
|
# clear error to the agent rather than letting an HTTP 4xx bubble up
|
|
# from the platform with a generic error message.
|
|
#
|
|
# Lenient on case + whitespace because real-world peer-id strings
|
|
# come from list_peers/discover_peer responses (canonical lowercase)
|
|
# or hand-typed agent input (mixed-case acceptable). Strict on
|
|
# everything else.
|
|
_UUID_RE = re.compile(
|
|
r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
|
|
)
|
|
|
|
|
|
def _validate_peer_id(peer_id: str) -> str | None:
|
|
"""Return the canonicalised peer_id if valid, else None.
|
|
|
|
Returning None instead of raising so callers in tool surfaces can
|
|
convert to a friendly agent-facing string ("workspace_id is not a
|
|
valid UUID") rather than crashing with a stack trace.
|
|
"""
|
|
if not isinstance(peer_id, str):
|
|
return None
|
|
pid = peer_id.strip()
|
|
if not _UUID_RE.match(pid):
|
|
return None
|
|
return pid.lower()
|
|
|
|
|
|
async def discover_peer(target_id: str) -> dict | None:
|
|
"""Discover a peer workspace's URL via the platform registry.
|
|
|
|
Validates ``target_id`` is a UUID before constructing the URL — a
|
|
malformed id can't reach the platform handler now, which both
|
|
short-circuits an avoidable round-trip AND ensures we never
|
|
interpolate path-traversal characters into the URL.
|
|
"""
|
|
safe_id = _validate_peer_id(target_id)
|
|
if safe_id is None:
|
|
return None
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/registry/discover/{safe_id}",
|
|
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Discovery failed for {target_id}: {e}")
|
|
return None
|
|
|
|
|
|
# httpx exception classes that indicate a transient transport-layer
|
|
# failure worth retrying — the request never produced an application
|
|
# response, so a fresh attempt has a real chance of succeeding. Any
|
|
# error not in this tuple is treated as deterministic (HTTP-status,
|
|
# JSON parse, runtime-returned JSON-RPC error, etc.) and surfaced to
|
|
# the caller on the first try.
|
|
#
|
|
# Why each one belongs here:
|
|
# - ConnectError / ConnectTimeout: peer's listening socket wasn't
|
|
# ready (mid-restart, not yet bound). Fast failure, fast recovery.
|
|
# - RemoteProtocolError: peer closed the TCP connection without
|
|
# writing a response — observed on 2026-04-27 when a peer's prior
|
|
# in-flight Claude SDK session aborted and the new request's
|
|
# connection was reset mid-handler.
|
|
# - ReadError / WriteError: TCP read/write socket error mid-flight,
|
|
# typically a network blip on the Docker bridge or a peer worker
|
|
# crash.
|
|
# - ReadTimeout: peer didn't write ANY response bytes within the
|
|
# 300s read budget. Distinct from "peer is slow but progressing"
|
|
# (which httpx surfaces as a successful read with chunked bytes).
|
|
# Retry budget caps the worst case — see _DELEGATE_TOTAL_BUDGET_S.
|
|
_TRANSIENT_HTTP_ERRORS: tuple[type[Exception], ...] = (
|
|
httpx.ConnectError,
|
|
httpx.ConnectTimeout,
|
|
httpx.ReadError,
|
|
httpx.WriteError,
|
|
httpx.RemoteProtocolError,
|
|
httpx.ReadTimeout,
|
|
)
|
|
|
|
# Retry budget. Up to 5 attempts (1 initial + 4 retries) with
|
|
# exponential backoff (1, 2, 4, 8 seconds), each backoff jittered ±25%
|
|
# to prevent synchronized retry storms across siblings if a peer flaps.
|
|
# _DELEGATE_TOTAL_BUDGET_S caps cumulative wall-clock so a string of
|
|
# ReadTimeouts can't make the caller wait 25 minutes — once the
|
|
# deadline elapses we stop retrying even if attempts remain. 600s = 10
|
|
# minutes is the agreed worst case the caller can tolerate before
|
|
# falling back to "peer unavailable" handling in tool_delegate_task.
|
|
_DELEGATE_MAX_ATTEMPTS = 5
|
|
_DELEGATE_BACKOFF_BASE_S = 1.0
|
|
_DELEGATE_BACKOFF_CAP_S = 16.0
|
|
_DELEGATE_TOTAL_BUDGET_S = 600.0
|
|
|
|
|
|
def _delegate_backoff_seconds(attempt_zero_indexed: int) -> float:
|
|
"""Return the (jittered) backoff delay before retrying after the
|
|
given attempt index (0 = backoff before retry #1).
|
|
|
|
Pure function so the schedule is unit-testable without monkey-
|
|
patching asyncio.sleep. Jitter is symmetric ±25% on top of the
|
|
capped exponential — enough to break sync across simultaneous
|
|
callers without making the schedule unpredictable.
|
|
"""
|
|
base = min(_DELEGATE_BACKOFF_BASE_S * (2 ** attempt_zero_indexed), _DELEGATE_BACKOFF_CAP_S)
|
|
jitter = base * (0.5 * random.random() - 0.25)
|
|
return max(0.0, base + jitter)
|
|
|
|
|
|
def _format_a2a_error(exc: BaseException, target_url: str) -> str:
|
|
"""Format an httpx exception as an [A2A_ERROR] string.
|
|
|
|
Some httpx exceptions stringify to empty (RemoteProtocolError,
|
|
ConnectionReset variants) — the canvas would then render
|
|
"[A2A_ERROR] " with no detail and the operator has no signal to
|
|
act on. Always include the exception class name and the target
|
|
URL so the activity log + Agent Comms panel have actionable
|
|
information without a trip through container logs.
|
|
"""
|
|
msg = str(exc).strip()
|
|
type_name = type(exc).__name__
|
|
if not msg:
|
|
detail = f"{type_name} (no message — likely connection reset or silent timeout)"
|
|
elif msg.startswith(f"{type_name}:") or msg.startswith(f"{type_name} "):
|
|
# Already prefixed with the type — don't double-prefix.
|
|
# Prefix-anchored check (not substring) so a message that
|
|
# happens to mention some OTHER class name mid-string
|
|
# (e.g. "got OSError on read") doesn't suppress our own
|
|
# type prefix and lose the diagnostic signal.
|
|
detail = msg
|
|
else:
|
|
detail = f"{type_name}: {msg}"
|
|
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
|
|
|
|
|
async def send_a2a_message(peer_id: str, message: str) -> str:
|
|
"""Send an A2A ``message/send`` to a peer workspace via the platform proxy.
|
|
|
|
The target URL is constructed internally as
|
|
``${PLATFORM_URL}/workspaces/{peer_id}/a2a``. Going through the
|
|
platform's A2A proxy is the only path that works for both
|
|
in-container and external runtimes — see
|
|
a2a_tools.tool_delegate_task for the rationale.
|
|
|
|
Auto-retries up to _DELEGATE_MAX_ATTEMPTS times on transient
|
|
transport-layer errors (RemoteProtocolError, ConnectError,
|
|
ReadTimeout, etc.) with exponential-backoff + jitter, capped by
|
|
_DELEGATE_TOTAL_BUDGET_S. Application-level failures (HTTP 4xx,
|
|
JSON-RPC error response, malformed JSON) are NOT retried — they
|
|
indicate a deterministic problem retry won't fix.
|
|
"""
|
|
safe_id = _validate_peer_id(peer_id)
|
|
if safe_id is None:
|
|
return f"{_A2A_ERROR_PREFIX}invalid peer_id (expected UUID): {peer_id!r}"
|
|
target_url = f"{PLATFORM_URL}/workspaces/{safe_id}/a2a"
|
|
|
|
# Fix F (Cycle 5 / H2 — flagged 5 consecutive audits): timeout=None allowed
|
|
# a hung upstream to block the agent indefinitely. Use a generous but bounded
|
|
# timeout: 30s connect + 300s read (long enough for slow LLM responses).
|
|
timeout_cfg = httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0)
|
|
deadline = time.monotonic() + _DELEGATE_TOTAL_BUDGET_S
|
|
last_exc: BaseException | None = None
|
|
|
|
for attempt in range(_DELEGATE_MAX_ATTEMPTS):
|
|
async with httpx.AsyncClient(timeout=timeout_cfg) as client:
|
|
try:
|
|
# self_source_headers() includes X-Workspace-ID so the
|
|
# platform's a2a_receive logger records source_id =
|
|
# WORKSPACE_ID. Otherwise peer-A2A messages — including
|
|
# the case where target_url resolves to this workspace's
|
|
# own /a2a — get logged with source_id=NULL and surface
|
|
# in the recipient's My Chat tab as user-typed input.
|
|
resp = await client.post(
|
|
target_url,
|
|
headers=self_source_headers(WORKSPACE_ID),
|
|
json={
|
|
"jsonrpc": "2.0",
|
|
"id": str(uuid.uuid4()),
|
|
"method": "message/send",
|
|
"params": {
|
|
"message": {
|
|
"role": "user",
|
|
"messageId": str(uuid.uuid4()),
|
|
"parts": [{"kind": "text", "text": message}],
|
|
}
|
|
},
|
|
},
|
|
)
|
|
data = resp.json()
|
|
if "result" in data:
|
|
parts = data["result"].get("parts", [])
|
|
text = parts[0].get("text", "") if parts else "(no response)"
|
|
# Tag child-reported errors so the caller can detect them reliably
|
|
if text.startswith("Agent error:"):
|
|
return f"{_A2A_ERROR_PREFIX}{text}"
|
|
return text
|
|
elif "error" in data:
|
|
err = data["error"]
|
|
msg = (err.get("message") or "").strip()
|
|
code = err.get("code")
|
|
if msg and code is not None:
|
|
detail = f"{msg} (code={code})"
|
|
elif msg:
|
|
detail = msg
|
|
elif code is not None:
|
|
detail = f"JSON-RPC error with no message (code={code})"
|
|
else:
|
|
detail = "JSON-RPC error with no message"
|
|
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
|
return f"{_A2A_ERROR_PREFIX}unexpected response shape (no result, no error): {str(data)[:200]} [target={target_url}]"
|
|
except _TRANSIENT_HTTP_ERRORS as e:
|
|
last_exc = e
|
|
attempts_remaining = _DELEGATE_MAX_ATTEMPTS - (attempt + 1)
|
|
if attempts_remaining <= 0 or time.monotonic() >= deadline:
|
|
# Out of attempts OR out of total budget — surface
|
|
# the last error to the caller.
|
|
break
|
|
delay = _delegate_backoff_seconds(attempt)
|
|
# Don't sleep past the deadline — clamp.
|
|
remaining = deadline - time.monotonic()
|
|
if delay > remaining:
|
|
delay = max(0.0, remaining)
|
|
logger.warning(
|
|
"send_a2a_message: transient %s on attempt %d/%d, retrying in %.1fs (target=%s)",
|
|
type(e).__name__,
|
|
attempt + 1,
|
|
_DELEGATE_MAX_ATTEMPTS,
|
|
delay,
|
|
target_url,
|
|
)
|
|
await asyncio.sleep(delay)
|
|
continue
|
|
except Exception as e:
|
|
# Non-transient (HTTP-status, JSON parse, etc.) — don't retry.
|
|
return _format_a2a_error(e, target_url)
|
|
# Retries exhausted (or budget elapsed). last_exc must be set
|
|
# because we only break out of the loop after assigning it.
|
|
assert last_exc is not None # noqa: S101
|
|
return _format_a2a_error(last_exc, target_url)
|
|
|
|
|
|
async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]:
|
|
"""Get this workspace's peers, returning (peers, diagnostic).
|
|
|
|
diagnostic is None when the call succeeded (status 200, even if the list
|
|
is empty). When peers is [] for a non-trivial reason (auth failure,
|
|
workspace-id missing from registry, platform error, network error),
|
|
diagnostic is a short human-readable string explaining what went wrong
|
|
so callers can surface it instead of "may be isolated" — see #2397.
|
|
|
|
The legacy get_peers() shim below preserves the bare-list contract for
|
|
non-tool callers.
|
|
"""
|
|
url = f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers"
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
url,
|
|
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
|
)
|
|
except Exception as e:
|
|
return [], f"Cannot reach platform at {PLATFORM_URL}: {e}"
|
|
|
|
if resp.status_code == 200:
|
|
try:
|
|
data = resp.json()
|
|
except Exception as e:
|
|
return [], f"Platform returned 200 but body was not JSON: {e}"
|
|
if not isinstance(data, list):
|
|
return [], f"Platform returned 200 but body was not a list: {type(data).__name__}"
|
|
return data, None
|
|
|
|
if resp.status_code in (401, 403):
|
|
return [], (
|
|
f"Authentication to platform failed (HTTP {resp.status_code}). "
|
|
"The workspace bearer token may be invalid — restarting the workspace usually re-mints it."
|
|
)
|
|
if resp.status_code == 404:
|
|
return [], (
|
|
f"Workspace ID {WORKSPACE_ID} is not registered with the platform (HTTP 404). "
|
|
"Re-registration via the platform's /registry/register endpoint is needed."
|
|
)
|
|
if 500 <= resp.status_code < 600:
|
|
return [], f"Platform error: HTTP {resp.status_code}."
|
|
return [], f"Unexpected platform response: HTTP {resp.status_code}."
|
|
|
|
|
|
async def get_peers() -> list[dict]:
|
|
"""Get this workspace's peers from the platform registry.
|
|
|
|
Bare-list shim over get_peers_with_diagnostic() — discards the diagnostic
|
|
so callers that don't care about the failure reason (e.g. system-prompt
|
|
bootstrap formatters) get the same shape they always had.
|
|
"""
|
|
peers, _ = await get_peers_with_diagnostic()
|
|
return peers
|
|
|
|
|
|
async def get_workspace_info() -> dict:
|
|
"""Get this workspace's info from the platform.
|
|
|
|
Distinguishes three failure shapes so callers can handle them
|
|
distinctly (#2429):
|
|
- 410 Gone → workspace was deleted; re-onboard required
|
|
- 404 / other → workspace never existed (or transient)
|
|
- exception → network / auth failure
|
|
"""
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}",
|
|
headers=auth_headers(),
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
if resp.status_code == 410:
|
|
# #2429: platform returns 410 when status='removed'.
|
|
# Surface "removed" + the actionable hint so callers
|
|
# can prompt re-onboard instead of falling through to
|
|
# "not found" — which made the 2026-04-30 incident
|
|
# impossible to diagnose ("workspace not found" with
|
|
# a workspace_id we KNEW we'd just registered).
|
|
try:
|
|
body = resp.json()
|
|
except Exception:
|
|
body = {}
|
|
return {
|
|
"error": "removed",
|
|
"id": body.get("id", WORKSPACE_ID),
|
|
"removed_at": body.get("removed_at"),
|
|
"hint": body.get(
|
|
"hint",
|
|
"Workspace was deleted on the platform. "
|
|
"Regenerate workspace + token from the canvas → Tokens tab.",
|
|
),
|
|
}
|
|
return {"error": "not found"}
|
|
except Exception as e:
|
|
return {"error": str(e)}
|