Symptom: Activity tab and Agent Comms surfaced bare "[A2A_ERROR] "
(prefix + nothing) for failed delegations. Operator had no signal
to act on — no exception type, no target, no hint about what went
wrong, no next step. Fix is in three layers.
1. workspace/a2a_client.py — every error path now produces an
actionable detail string:
- except branch: some httpx exceptions (RemoteProtocolError,
ConnectionReset variants) stringify to "". Pre-fix the catch
was `f"{_A2A_ERROR_PREFIX}{e}"` → bare prefix. Now falls back
to `<TypeName> (no message — likely connection reset or silent
timeout)` and always appends `[target=<url>]` for traceability
in chained delegations.
- JSON-RPC error branch: previously dropped error.code on the
floor and printed "unknown" when message was missing. Now
surfaces both, including the well-defined "JSON-RPC error
with no message (code=N)" path.
- "neither result nor error" branch: pre-fix returned
str(payload) which the canvas rendered as a successful
response block. Now tagged as A2A_ERROR with a payload
snippet so downstream UI routes through the error path.
2. workspace/a2a_tools.py — tool_delegate_task now passes
error_detail (the stripped error message) through to the
activity-log POST. The platform's activity_logs.error_detail
column is the canvas's red error chip source; populating it
makes the failure visible in the row header without the user
having to expand into raw response_body JSON. The summary line
also gets a 120-char prefix of the cause so the collapsed row
reads "React Engineer failed: ConnectionResetError: ... [target=...]"
instead of "React Engineer failed".
3. canvas/src/components/tabs/ActivityTab.tsx — MessagePreview
now detects [A2A_ERROR]-prefixed bodies and renders a
structured error block (red chip, stripped detail, cause hint)
instead of the previous gray text-block that showed the literal
"[A2A_ERROR]" string. inferA2AErrorHint mirrors the patterns
from AgentCommsPanel.inferCauseHint so the same symptom reads
the same way in both surfaces (Claude SDK init wedge → restart
workspace; timeout → busy/stuck; connection-reset → transient
blip then check logs).
Tests: 9 send_a2a_message tests pass (including a new regression
test for the empty-stringifying-exception case that the user
reported); 995 canvas tests pass; tsc clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
149 lines
6.1 KiB
Python
149 lines
6.1 KiB
Python
"""A2A protocol client — peer discovery, messaging, and workspace info.
|
|
|
|
Shared constants (WORKSPACE_ID, PLATFORM_URL) live here so that
|
|
a2a_tools and a2a_mcp_server can import them from a single place.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import uuid
|
|
|
|
import httpx
|
|
|
|
from platform_auth import auth_headers, self_source_headers
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_WORKSPACE_ID_raw = os.environ.get("WORKSPACE_ID")
|
|
if not _WORKSPACE_ID_raw:
|
|
raise RuntimeError("WORKSPACE_ID environment variable is required but not set")
|
|
WORKSPACE_ID = _WORKSPACE_ID_raw
|
|
if os.path.exists("/.dockerenv") or os.environ.get("DOCKER_VERSION"):
|
|
PLATFORM_URL = os.environ.get("PLATFORM_URL", "http://host.docker.internal:8080")
|
|
else:
|
|
PLATFORM_URL = os.environ.get("PLATFORM_URL", "http://localhost:8080")
|
|
|
|
# Cache workspace ID → name mappings (populated by list_peers calls)
|
|
_peer_names: dict[str, str] = {}
|
|
|
|
# Sentinel prefix for errors originating from send_a2a_message / child agents.
|
|
# Used by delegate_task to distinguish real errors from normal response text.
|
|
_A2A_ERROR_PREFIX = "[A2A_ERROR] "
|
|
|
|
|
|
async def discover_peer(target_id: str) -> dict | None:
|
|
"""Discover a peer workspace's URL via the platform registry."""
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/registry/discover/{target_id}",
|
|
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Discovery failed for {target_id}: {e}")
|
|
return None
|
|
|
|
|
|
async def send_a2a_message(target_url: str, message: str) -> str:
|
|
"""Send an A2A message/send to a target workspace."""
|
|
# Fix F (Cycle 5 / H2 — flagged 5 consecutive audits): timeout=None allowed
|
|
# a hung upstream to block the agent indefinitely. Use a generous but bounded
|
|
# timeout: 30s connect + 300s read (long enough for slow LLM responses).
|
|
async with httpx.AsyncClient(
|
|
timeout=httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0)
|
|
) as client:
|
|
try:
|
|
# self_source_headers() includes X-Workspace-ID so the
|
|
# platform's a2a_receive logger records source_id =
|
|
# WORKSPACE_ID. Otherwise peer-A2A messages — including
|
|
# the case where target_url resolves to this workspace's
|
|
# own /a2a — get logged with source_id=NULL and surface
|
|
# in the recipient's My Chat tab as user-typed input.
|
|
resp = await client.post(
|
|
target_url,
|
|
headers=self_source_headers(WORKSPACE_ID),
|
|
json={
|
|
"jsonrpc": "2.0",
|
|
"id": str(uuid.uuid4()),
|
|
"method": "message/send",
|
|
"params": {
|
|
"message": {
|
|
"role": "user",
|
|
"messageId": str(uuid.uuid4()),
|
|
"parts": [{"kind": "text", "text": message}],
|
|
}
|
|
},
|
|
},
|
|
)
|
|
data = resp.json()
|
|
if "result" in data:
|
|
parts = data["result"].get("parts", [])
|
|
text = parts[0].get("text", "") if parts else "(no response)"
|
|
# Tag child-reported errors so the caller can detect them reliably
|
|
if text.startswith("Agent error:"):
|
|
return f"{_A2A_ERROR_PREFIX}{text}"
|
|
return text
|
|
elif "error" in data:
|
|
err = data["error"]
|
|
msg = (err.get("message") or "").strip()
|
|
code = err.get("code")
|
|
if msg and code is not None:
|
|
detail = f"{msg} (code={code})"
|
|
elif msg:
|
|
detail = msg
|
|
elif code is not None:
|
|
detail = f"JSON-RPC error with no message (code={code})"
|
|
else:
|
|
detail = "JSON-RPC error with no message"
|
|
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
|
return f"{_A2A_ERROR_PREFIX}unexpected response shape (no result, no error): {str(data)[:200]} [target={target_url}]"
|
|
except Exception as e:
|
|
# Some httpx exceptions stringify to empty (RemoteProtocolError,
|
|
# ConnectionReset variants) — the canvas would then render
|
|
# "[A2A_ERROR] " with no detail and the operator has no signal
|
|
# to act on. Always include the exception class name and the
|
|
# target URL so the activity log + Agent Comms panel have
|
|
# actionable information without a trip through container logs.
|
|
msg = str(e).strip()
|
|
type_name = type(e).__name__
|
|
if not msg:
|
|
detail = f"{type_name} (no message — likely connection reset or silent timeout)"
|
|
elif type_name in msg:
|
|
detail = msg
|
|
else:
|
|
detail = f"{type_name}: {msg}"
|
|
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
|
|
|
|
|
async def get_peers() -> list[dict]:
|
|
"""Get this workspace's peers from the platform registry."""
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers",
|
|
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
return []
|
|
except Exception:
|
|
return []
|
|
|
|
|
|
async def get_workspace_info() -> dict:
|
|
"""Get this workspace's info from the platform."""
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}",
|
|
headers=auth_headers(),
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
return {"error": "not found"}
|
|
except Exception as e:
|
|
return {"error": str(e)}
|