Audited every a2a-sdk surface in workspace/ against the installed
1.0.2 wheel. Found and fixed:
main.py (the live workspace startup path):
• create_jsonrpc_routes(rpc_url='/', enable_v0_3_compat=True) —
rpc_url required in 1.x; v0.3 compat enables inbound legacy
clients (`"role": "user"` lowercase) without forcing them to
upgrade. Pairs with the outbound rename below.
a2a_executor.py:
• TextPart/FilePart/FileWithUri removed in 1.x. Part is now a
flat proto message: Part(text=…) / Part(url=…, filename=…,
media_type=…). Updated the file-attachment branch (only
reachable when an agent emits files; the harness's PONG path
didn't exercise this, but it's a latent crash).
• Message field names: messageId/taskId/contextId →
message_id/task_id/context_id (proto3 snake_case).
• Role enum: Role.agent → Role.ROLE_AGENT (proto enum).
Outbound JSON-RPC payloads (8 files):
• "role": "user" → "role": "ROLE_USER" — proto3 JSON serialization
is strict about enum values. Sites: a2a_client, a2a_cli, main
(initial+idle prompts), heartbeat, builtin_tools/a2a_tools,
builtin_tools/delegation. Wire JSON keys stay camelCase
(proto3 default), only the role enum value changed.
google-adk/adapter.py:
• new_agent_text_message → new_text_message (4 sites). This
adapter's directory has a hyphen, so it can't be imported as a
Python module — effectively dead code, but the wheel ships the
file and a future fix should keep it correct against 1.x.
Why one PR instead of seven: every previous a2a-sdk migration find
landed as its own publish → cascade → harness → next-bug cycle.
Today's audit ran every a2a-sdk symbol/type/method in workspace/
against the installed 1.0.2 wheel in a single sweep + tested the
critical paths (Message construction, Part construction, Role enum
parsing) against the actual SDK. Should be the last migration PR.
Verified locally:
python3 scripts/build_runtime_package.py --version 0.1.99 \
--out /tmp/build-final
pip install /tmp/build-final
python -c "import molecule_runtime.main; \
from molecule_runtime.a2a_executor import LangGraphA2AExecutor"
→ ✓ all imports clean against a2a-sdk 1.0.2
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
154 lines
6.5 KiB
Python
154 lines
6.5 KiB
Python
"""A2A protocol client — peer discovery, messaging, and workspace info.
|
|
|
|
Shared constants (WORKSPACE_ID, PLATFORM_URL) live here so that
|
|
a2a_tools and a2a_mcp_server can import them from a single place.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import uuid
|
|
|
|
import httpx
|
|
|
|
from platform_auth import auth_headers, self_source_headers
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_WORKSPACE_ID_raw = os.environ.get("WORKSPACE_ID")
|
|
if not _WORKSPACE_ID_raw:
|
|
raise RuntimeError("WORKSPACE_ID environment variable is required but not set")
|
|
WORKSPACE_ID = _WORKSPACE_ID_raw
|
|
if os.path.exists("/.dockerenv") or os.environ.get("DOCKER_VERSION"):
|
|
PLATFORM_URL = os.environ.get("PLATFORM_URL", "http://host.docker.internal:8080")
|
|
else:
|
|
PLATFORM_URL = os.environ.get("PLATFORM_URL", "http://localhost:8080")
|
|
|
|
# Cache workspace ID → name mappings (populated by list_peers calls)
|
|
_peer_names: dict[str, str] = {}
|
|
|
|
# Sentinel prefix for errors originating from send_a2a_message / child agents.
|
|
# Used by delegate_task to distinguish real errors from normal response text.
|
|
_A2A_ERROR_PREFIX = "[A2A_ERROR] "
|
|
|
|
|
|
async def discover_peer(target_id: str) -> dict | None:
|
|
"""Discover a peer workspace's URL via the platform registry."""
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/registry/discover/{target_id}",
|
|
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
return None
|
|
except Exception as e:
|
|
logger.error(f"Discovery failed for {target_id}: {e}")
|
|
return None
|
|
|
|
|
|
async def send_a2a_message(target_url: str, message: str) -> str:
|
|
"""Send an A2A message/send to a target workspace."""
|
|
# Fix F (Cycle 5 / H2 — flagged 5 consecutive audits): timeout=None allowed
|
|
# a hung upstream to block the agent indefinitely. Use a generous but bounded
|
|
# timeout: 30s connect + 300s read (long enough for slow LLM responses).
|
|
async with httpx.AsyncClient(
|
|
timeout=httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0)
|
|
) as client:
|
|
try:
|
|
# self_source_headers() includes X-Workspace-ID so the
|
|
# platform's a2a_receive logger records source_id =
|
|
# WORKSPACE_ID. Otherwise peer-A2A messages — including
|
|
# the case where target_url resolves to this workspace's
|
|
# own /a2a — get logged with source_id=NULL and surface
|
|
# in the recipient's My Chat tab as user-typed input.
|
|
resp = await client.post(
|
|
target_url,
|
|
headers=self_source_headers(WORKSPACE_ID),
|
|
json={
|
|
"jsonrpc": "2.0",
|
|
"id": str(uuid.uuid4()),
|
|
"method": "message/send",
|
|
"params": {
|
|
"message": {
|
|
"role": "ROLE_USER",
|
|
"messageId": str(uuid.uuid4()),
|
|
"parts": [{"kind": "text", "text": message}],
|
|
}
|
|
},
|
|
},
|
|
)
|
|
data = resp.json()
|
|
if "result" in data:
|
|
parts = data["result"].get("parts", [])
|
|
text = parts[0].get("text", "") if parts else "(no response)"
|
|
# Tag child-reported errors so the caller can detect them reliably
|
|
if text.startswith("Agent error:"):
|
|
return f"{_A2A_ERROR_PREFIX}{text}"
|
|
return text
|
|
elif "error" in data:
|
|
err = data["error"]
|
|
msg = (err.get("message") or "").strip()
|
|
code = err.get("code")
|
|
if msg and code is not None:
|
|
detail = f"{msg} (code={code})"
|
|
elif msg:
|
|
detail = msg
|
|
elif code is not None:
|
|
detail = f"JSON-RPC error with no message (code={code})"
|
|
else:
|
|
detail = "JSON-RPC error with no message"
|
|
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
|
return f"{_A2A_ERROR_PREFIX}unexpected response shape (no result, no error): {str(data)[:200]} [target={target_url}]"
|
|
except Exception as e:
|
|
# Some httpx exceptions stringify to empty (RemoteProtocolError,
|
|
# ConnectionReset variants) — the canvas would then render
|
|
# "[A2A_ERROR] " with no detail and the operator has no signal
|
|
# to act on. Always include the exception class name and the
|
|
# target URL so the activity log + Agent Comms panel have
|
|
# actionable information without a trip through container logs.
|
|
msg = str(e).strip()
|
|
type_name = type(e).__name__
|
|
if not msg:
|
|
detail = f"{type_name} (no message — likely connection reset or silent timeout)"
|
|
elif msg.startswith(f"{type_name}:") or msg.startswith(f"{type_name} "):
|
|
# Already prefixed with the type — don't double-prefix.
|
|
# Prefix-anchored check (not substring) so a message that
|
|
# happens to mention some OTHER class name mid-string
|
|
# (e.g. "got OSError on read") doesn't suppress our own
|
|
# type prefix and lose the diagnostic signal.
|
|
detail = msg
|
|
else:
|
|
detail = f"{type_name}: {msg}"
|
|
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
|
|
|
|
|
async def get_peers() -> list[dict]:
|
|
"""Get this workspace's peers from the platform registry."""
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers",
|
|
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
return []
|
|
except Exception:
|
|
return []
|
|
|
|
|
|
async def get_workspace_info() -> dict:
|
|
"""Get this workspace's info from the platform."""
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.get(
|
|
f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}",
|
|
headers=auth_headers(),
|
|
)
|
|
if resp.status_code == 200:
|
|
return resp.json()
|
|
return {"error": "not found"}
|
|
except Exception as e:
|
|
return {"error": str(e)}
|