From 391e187281df0f13edae72615d0dd9d5eaf3e6d5 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 24 Apr 2026 23:40:05 -0700 Subject: [PATCH] fix(a2a,canvas): make delivery failures comprehensive instead of "[A2A_ERROR] " MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: Activity tab and Agent Comms surfaced bare "[A2A_ERROR] " (prefix + nothing) for failed delegations. Operator had no signal to act on — no exception type, no target, no hint about what went wrong, no next step. Fix is in three layers. 1. workspace/a2a_client.py — every error path now produces an actionable detail string: - except branch: some httpx exceptions (RemoteProtocolError, ConnectionReset variants) stringify to "". Pre-fix the catch was `f"{_A2A_ERROR_PREFIX}{e}"` → bare prefix. Now falls back to ` (no message — likely connection reset or silent timeout)` and always appends `[target=]` for traceability in chained delegations. - JSON-RPC error branch: previously dropped error.code on the floor and printed "unknown" when message was missing. Now surfaces both, including the well-defined "JSON-RPC error with no message (code=N)" path. - "neither result nor error" branch: pre-fix returned str(payload) which the canvas rendered as a successful response block. Now tagged as A2A_ERROR with a payload snippet so downstream UI routes through the error path. 2. workspace/a2a_tools.py — tool_delegate_task now passes error_detail (the stripped error message) through to the activity-log POST. The platform's activity_logs.error_detail column is the canvas's red error chip source; populating it makes the failure visible in the row header without the user having to expand into raw response_body JSON. The summary line also gets a 120-char prefix of the cause so the collapsed row reads "React Engineer failed: ConnectionResetError: ... [target=...]" instead of "React Engineer failed". 3. canvas/src/components/tabs/ActivityTab.tsx — MessagePreview now detects [A2A_ERROR]-prefixed bodies and renders a structured error block (red chip, stripped detail, cause hint) instead of the previous gray text-block that showed the literal "[A2A_ERROR]" string. inferA2AErrorHint mirrors the patterns from AgentCommsPanel.inferCauseHint so the same symptom reads the same way in both surfaces (Claude SDK init wedge → restart workspace; timeout → busy/stuck; connection-reset → transient blip then check logs). Tests: 9 send_a2a_message tests pass (including a new regression test for the empty-stringifying-exception case that the user reported); 995 canvas tests pass; tsc clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/tabs/ActivityTab.tsx | 51 ++++++++++++++++++++ workspace/a2a_client.py | 31 +++++++++++-- workspace/a2a_tools.py | 16 ++++++- workspace/tests/test_a2a_client.py | 54 ++++++++++++++++++++-- 4 files changed, 143 insertions(+), 9 deletions(-) diff --git a/canvas/src/components/tabs/ActivityTab.tsx b/canvas/src/components/tabs/ActivityTab.tsx index fc857842..3a3d47fa 100644 --- a/canvas/src/components/tabs/ActivityTab.tsx +++ b/canvas/src/components/tabs/ActivityTab.tsx @@ -286,6 +286,49 @@ function ActivityRow({ ); } +const A2A_ERROR_PREFIX = "[A2A_ERROR]"; + +/** Best-effort cause hint for an A2A delivery failure. Mirrors the + * patterns used by AgentCommsPanel's inferCauseHint so the same + * symptom reads the same way in both surfaces. */ +function inferA2AErrorHint(detail: string): string { + const t = detail.toLowerCase(); + if (t.includes("control request timeout")) { + return "The remote agent's Claude Code SDK is wedged on initialization. Restart the workspace to clear it."; + } + if (t.includes("readtimeout") || t.includes("connecttimeout") || t.includes("deadline exceeded") || t.includes("timeout")) { + return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task or its runtime is stuck — restart if this repeats."; + } + if (t.includes("connectionreset") || t.includes("remoteprotocolerror") || t.includes("connection reset") || t.includes("no message")) { + return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip; retry once. If it repeats, the remote container may have crashed mid-request — check its logs."; + } + if (t.includes("agent error") || t.includes("exception")) { + return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes."; + } + if (t.includes("not found") || t.includes("not accessible") || t.includes("offline")) { + return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying."; + } + return "Check the remote agent's container logs for context. Restart the workspace if the failure repeats."; +} + +/** Render a [A2A_ERROR]-prefixed response as a structured error block + * with a stripped detail line + a cause hint. The previous raw render + * ("[A2A_ERROR] " literal in the response area) gave the user no + * signal to act on. */ +function A2AErrorPreview({ label, raw }: { label: string; raw: string }) { + const detail = raw.slice(A2A_ERROR_PREFIX.length).trim() || "(no detail provided)"; + const hint = inferA2AErrorHint(detail); + return ( +
+
{label} — delivery failed
+
+
{detail}
+
{hint}
+
+
+ ); +} + /** Extract human-readable text from A2A request/response JSON */ function MessagePreview({ label, body }: { label: string; body: Record }) { // Try to extract text from A2A message parts @@ -295,6 +338,14 @@ function MessagePreview({ label, body }: { label: string; body: Record; + } return (
{label}
diff --git a/workspace/a2a_client.py b/workspace/a2a_client.py index 514dc438..8893e2e7 100644 --- a/workspace/a2a_client.py +++ b/workspace/a2a_client.py @@ -87,10 +87,35 @@ async def send_a2a_message(target_url: str, message: str) -> str: return f"{_A2A_ERROR_PREFIX}{text}" return text elif "error" in data: - return f"{_A2A_ERROR_PREFIX}{data['error'].get('message', 'unknown')}" - return str(data) + err = data["error"] + msg = (err.get("message") or "").strip() + code = err.get("code") + if msg and code is not None: + detail = f"{msg} (code={code})" + elif msg: + detail = msg + elif code is not None: + detail = f"JSON-RPC error with no message (code={code})" + else: + detail = "JSON-RPC error with no message" + return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]" + return f"{_A2A_ERROR_PREFIX}unexpected response shape (no result, no error): {str(data)[:200]} [target={target_url}]" except Exception as e: - return f"{_A2A_ERROR_PREFIX}{e}" + # Some httpx exceptions stringify to empty (RemoteProtocolError, + # ConnectionReset variants) — the canvas would then render + # "[A2A_ERROR] " with no detail and the operator has no signal + # to act on. Always include the exception class name and the + # target URL so the activity log + Agent Comms panel have + # actionable information without a trip through container logs. + msg = str(e).strip() + type_name = type(e).__name__ + if not msg: + detail = f"{type_name} (no message — likely connection reset or silent timeout)" + elif type_name in msg: + detail = msg + else: + detail = f"{type_name}: {msg}" + return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]" async def get_peers() -> list[dict]: diff --git a/workspace/a2a_tools.py b/workspace/a2a_tools.py index 691491d7..4834870b 100644 --- a/workspace/a2a_tools.py +++ b/workspace/a2a_tools.py @@ -112,7 +112,7 @@ def _auth_headers_for_heartbeat() -> dict[str, str]: async def report_activity( activity_type: str, target_id: str = "", summary: str = "", status: str = "ok", - task_text: str = "", response_text: str = "", + task_text: str = "", response_text: str = "", error_detail: str = "", ): """Report activity to the platform for live progress tracking.""" try: @@ -129,6 +129,13 @@ async def report_activity( payload["request_body"] = {"task": task_text} if response_text: payload["response_body"] = {"result": response_text} + if error_detail: + # error_detail is a top-level activity row column on the + # platform (handlers/activity.go). Surfacing the cleaned + # exception string here lets the Activity tab render a + # red error chip + the cause without forcing the user + # to scroll into the raw response_body JSON. + payload["error_detail"] = error_detail await client.post( f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}/activity", json=payload, @@ -178,11 +185,16 @@ async def tool_delegate_task(workspace_id: str, task: str) -> str: # Detect delegation failures — wrap them clearly so the calling agent # can decide to retry, use another peer, or handle the task itself. is_error = result.startswith(_A2A_ERROR_PREFIX) + # Strip the sentinel prefix so error_detail is the human-readable + # cause directly. The Activity tab's red error chip surfaces this + # without the user having to scroll into the raw response JSON. + error_detail = result[len(_A2A_ERROR_PREFIX):].strip() if is_error else "" await report_activity( "a2a_receive", workspace_id, - f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed", + f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed: {error_detail[:120]}", task_text=task, response_text=result, status="error" if is_error else "ok", + error_detail=error_detail, ) if is_error: return ( diff --git a/workspace/tests/test_a2a_client.py b/workspace/tests/test_a2a_client.py index fbae0ea0..a9fb522a 100644 --- a/workspace/tests/test_a2a_client.py +++ b/workspace/tests/test_a2a_client.py @@ -199,10 +199,16 @@ class TestSendA2AMessage: result = await a2a_client.send_a2a_message("http://target/a2a", "task") assert result.startswith(a2a_client._A2A_ERROR_PREFIX) - assert "unknown" in result + # The error includes the JSON-RPC code so the operator can look it + # up; "no message" surfaces the missing-message condition explicitly + # instead of the previous opaque "unknown". + assert "code=-32600" in result + assert "no message" in result.lower() + # Target URL is included so chained delegations are traceable. + assert "target=http://target/a2a" in result - async def test_neither_result_nor_error_returns_str_of_data(self): - """Response with neither 'result' nor 'error' → str(data).""" + async def test_neither_result_nor_error_returns_a2a_error_with_payload(self): + """Response with neither 'result' nor 'error' → A2A_ERROR + payload context.""" import a2a_client payload = {"jsonrpc": "2.0", "id": "abc123"} @@ -212,7 +218,14 @@ class TestSendA2AMessage: with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): result = await a2a_client.send_a2a_message("http://target/a2a", "task") - assert result == str(payload) + # Pre-fix this returned bare str(payload) which the canvas + # rendered as a confusing "looks like a successful response" + # block. Now it's tagged so downstream UI / delegate_task + # routes it through the error path. + assert result.startswith(a2a_client._A2A_ERROR_PREFIX) + assert "unexpected response shape" in result + assert "abc123" in result # snippet of payload included for context + assert "target=http://target/a2a" in result async def test_exception_returns_error_prefix_and_message(self): """Network exception → returns _A2A_ERROR_PREFIX + exception text.""" @@ -225,6 +238,39 @@ class TestSendA2AMessage: assert result.startswith(a2a_client._A2A_ERROR_PREFIX) assert "connection refused" in result + # Exception class name is prepended when the message doesn't + # already include it — gives the operator a typed handle to + # search for in container logs. + assert "ConnectionError" in result + assert "target=http://target/a2a" in result + + async def test_empty_stringifying_exception_falls_back_to_class_name(self): + """The user's reported bug: httpx.RemoteProtocolError and similar + exceptions can stringify to "" — pre-fix the canvas rendered + "[A2A_ERROR] " with no detail. Verify the empty path now + produces an actionable message including the exception type + and the target URL.""" + import a2a_client + + # Subclass Exception with __str__ → "" to simulate the + # silent-exception variants without depending on a specific + # httpx version's behavior. + class _SilentRemoteProtocolError(Exception): + def __str__(self) -> str: + return "" + + mock_client = _make_mock_client(post_exc=_SilentRemoteProtocolError()) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result = await a2a_client.send_a2a_message("http://target/a2a", "task") + + # Must NOT be just the bare prefix — that's the regression. + assert result != a2a_client._A2A_ERROR_PREFIX.strip() + assert result != f"{a2a_client._A2A_ERROR_PREFIX}" + # Must include the class name + something explanatory. + assert "_SilentRemoteProtocolError" in result + assert "no message" in result.lower() + assert "target=http://target/a2a" in result async def test_result_text_part_missing_text_key_returns_empty(self): """Part dict without 'text' key → falls back to '' (empty string returned)."""