From c159d85eb544f6ac0fa7dec035bccae896a2a3c5 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 24 Apr 2026 23:47:44 -0700 Subject: [PATCH] =?UTF-8?q?fix(a2a):=20review-driven=20hardening=20?= =?UTF-8?q?=E2=80=94=20prefix-anchored=20type=20check,=20error=5Fdetail=20?= =?UTF-8?q?cap,=20shared=20hint=20module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three required fixes from the bundle review of 391e1872: 1. workspace/a2a_client.py: substring `type_name in msg` could miss the diagnostic prefix when an exception's message embedded a different class name mid-string (e.g. `OSError("see ConnectionError below")` → printed as plain msg, type lost). Switched to a prefix-anchored check (`msg.startswith(f"{type_name}:")` etc.) so the type label is always added when not already at the start of the message. 2. workspace/a2a_tools.py: `activity_logs.error_detail` is unbounded TEXT on the platform (handlers/activity.go does not validate length). A buggy or hostile peer could stream arbitrarily large error messages into the caller's activity log. Cap at 4096 chars at the producer — comfortably above any real exception traceback, well below an obvious-DoS threshold. 3. New regression test for JSON-RPC `code=0` — pins the `code is not None` semantics so the code is preserved in the detail rather than collapsing into the no-code path. Code=0 is not valid per the spec, but a malformed peer can still emit it and we want it visible for diagnosis. Plus one optional taken: extracted the A2A-error → hint mapping into canvas/src/components/tabs/chat/a2aErrorHint.ts. The two prior copies (AgentCommsPanel.inferCauseHint + ActivityTab.inferA2AErrorHint) had already drifted — Activity tab gained `not found`/`offline` cases the chat panel never picked up, AgentCommsPanel handled empty-input explicitly while Activity didn't. The shared module is the merged superset, with 10 unit tests pinning each named pattern + the "most specific first" ordering (Claude SDK wedge wins over generic timeout). Skipped (per analysis): - Unicode-naive 120-char slice — Python str[:N] slices on code points, not bytes. Safe. - Nested [A2A_ERROR] confusion — non-issue per reviewer; outer prefix winning still produces a structured render. - MessagePreview + JsonBlock dual render on errors — intentional drilldown; raw JSON is below the fold for operators who need it. - console.warn dedup — refetches don't happen per-event so spam risk is low. - str(data)[:200] materialization — A2A response bodies aren't typically MB-sized. Verified: 1005 canvas tests pass (10 new hint tests); 10 Python send_a2a_message tests pass (1 new for code=0); tsc clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/tabs/ActivityTab.tsx | 24 +------ .../components/tabs/chat/AgentCommsPanel.tsx | 28 ++------ .../tabs/chat/__tests__/a2aErrorHint.test.ts | 67 +++++++++++++++++++ .../src/components/tabs/chat/a2aErrorHint.ts | 54 +++++++++++++++ workspace/a2a_client.py | 7 +- workspace/a2a_tools.py | 9 ++- workspace/tests/test_a2a_client.py | 18 +++++ 7 files changed, 158 insertions(+), 49 deletions(-) create mode 100644 canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts create mode 100644 canvas/src/components/tabs/chat/a2aErrorHint.ts diff --git a/canvas/src/components/tabs/ActivityTab.tsx b/canvas/src/components/tabs/ActivityTab.tsx index 3a3d47fa..d0e31630 100644 --- a/canvas/src/components/tabs/ActivityTab.tsx +++ b/canvas/src/components/tabs/ActivityTab.tsx @@ -5,6 +5,7 @@ import { api } from "@/lib/api"; import { ConversationTraceModal } from "@/components/ConversationTraceModal"; import { type ActivityEntry } from "@/types/activity"; import { useWorkspaceName } from "@/hooks/useWorkspaceName"; +import { inferA2AErrorHint } from "./chat/a2aErrorHint"; interface Props { workspaceId: string; @@ -288,29 +289,6 @@ function ActivityRow({ const A2A_ERROR_PREFIX = "[A2A_ERROR]"; -/** Best-effort cause hint for an A2A delivery failure. Mirrors the - * patterns used by AgentCommsPanel's inferCauseHint so the same - * symptom reads the same way in both surfaces. */ -function inferA2AErrorHint(detail: string): string { - const t = detail.toLowerCase(); - if (t.includes("control request timeout")) { - return "The remote agent's Claude Code SDK is wedged on initialization. Restart the workspace to clear it."; - } - if (t.includes("readtimeout") || t.includes("connecttimeout") || t.includes("deadline exceeded") || t.includes("timeout")) { - return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task or its runtime is stuck — restart if this repeats."; - } - if (t.includes("connectionreset") || t.includes("remoteprotocolerror") || t.includes("connection reset") || t.includes("no message")) { - return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip; retry once. If it repeats, the remote container may have crashed mid-request — check its logs."; - } - if (t.includes("agent error") || t.includes("exception")) { - return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes."; - } - if (t.includes("not found") || t.includes("not accessible") || t.includes("offline")) { - return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying."; - } - return "Check the remote agent's container logs for context. Restart the workspace if the failure repeats."; -} - /** Render a [A2A_ERROR]-prefixed response as a structured error block * with a stripped detail line + a cause hint. The previous raw render * ("[A2A_ERROR] " literal in the response area) gave the user no diff --git a/canvas/src/components/tabs/chat/AgentCommsPanel.tsx b/canvas/src/components/tabs/chat/AgentCommsPanel.tsx index 15af5b03..909616bf 100644 --- a/canvas/src/components/tabs/chat/AgentCommsPanel.tsx +++ b/canvas/src/components/tabs/chat/AgentCommsPanel.tsx @@ -9,6 +9,7 @@ import { WS_URL } from "@/store/socket"; import { closeWebSocketGracefully } from "@/lib/ws-close"; import { showToast } from "../../Toaster"; import { extractResponseText, extractRequestText } from "./message-parser"; +import { inferA2AErrorHint } from "./a2aErrorHint"; export interface ActivityEntry { id: string; @@ -115,29 +116,8 @@ function unwrapErrorText(raw: string | null): string { return trimmed; } -/** Best-effort cause hint based on what we can see in the error text. - * These map known runtime symptoms to operator-actionable language so - * the user isn't left staring at "[A2A_ERROR]" with no next step. */ -function inferCauseHint(errorText: string): string { - const t = errorText.toLowerCase(); - // "control request timeout" is the specific Claude Code SDK init - // wedge symptom. Don't pattern on bare "initialize" — too broad - // (a user task containing "failed to initialize database" would - // false-positive into the SDK-wedge hint). - if (t.includes("control request timeout")) { - return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it."; - } - if (t.includes("deadline exceeded") || t.includes("timeout")) { - return "The remote agent didn't respond within the proxy timeout. It may be busy with a long-running task, or the runtime is stuck. Restart the workspace if this repeats."; - } - if (t.includes("agent error") || t.includes("exception")) { - return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes."; - } - if (errorText === "") { - return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move."; - } - return "The remote agent reported a delivery failure. Check the workspace logs or try restarting."; -} +// inferA2AErrorHint moved to ./a2aErrorHint so the Activity tab and +// this panel render identical hints for the same symptom. export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) { const [messages, setMessages] = useState([]); @@ -314,7 +294,7 @@ function ErrorMessage({ msg }: { msg: CommMessage }) { const selectNode = useCanvasStore((s) => s.selectNode); const [restarting, setRestarting] = useState(false); const errorText = unwrapErrorText(msg.responseText); - const hint = inferCauseHint(errorText); + const hint = inferA2AErrorHint(errorText); // Guard against acting on a peer whose workspace has been deleted // since this row was logged. Without the guard, restart 404s diff --git a/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts b/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts new file mode 100644 index 00000000..8518829b --- /dev/null +++ b/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts @@ -0,0 +1,67 @@ +import { describe, it, expect } from "vitest"; +import { inferA2AErrorHint } from "../a2aErrorHint"; + +// Pure logic. Pin every named pattern so a future contributor adding a +// new symptom doesn't accidentally collapse the buckets — and so the +// "most specific first" ordering can't drift without a test failing. + +describe("inferA2AErrorHint", () => { + it("matches the Claude Code SDK init wedge specifically", () => { + const hint = inferA2AErrorHint("Control request timeout: initialize"); + expect(hint).toMatch(/Claude Code SDK is wedged/); + }); + + it("does NOT misfire on user tasks containing 'initialize' generally", () => { + // Regression: an earlier bare-`initialize` pattern would have + // false-positived "failed to initialize database" into the SDK + // wedge hint. Confirm the full-phrase guard holds. + const hint = inferA2AErrorHint("failed to initialize database connection"); + expect(hint).not.toMatch(/Claude Code SDK/); + }); + + it("recognises httpx ReadTimeout / ConnectTimeout class names", () => { + expect(inferA2AErrorHint("ReadTimeout: timeout")).toMatch(/proxy timeout/); + expect(inferA2AErrorHint("ConnectTimeout: ...")).toMatch(/proxy timeout/); + }); + + it("recognises generic timeout / deadline-exceeded language", () => { + expect(inferA2AErrorHint("deadline exceeded after 300s")).toMatch(/proxy timeout/); + expect(inferA2AErrorHint("Operation timeout")).toMatch(/proxy timeout/); + }); + + it("handles connection-reset family (RemoteProtocolError, ConnectionReset, no-message)", () => { + expect(inferA2AErrorHint("RemoteProtocolError: ...")).toMatch(/connection.*dropped/); + expect(inferA2AErrorHint("ConnectionResetError")).toMatch(/connection.*dropped/); + expect(inferA2AErrorHint("connection reset by peer")).toMatch(/connection.*dropped/); + expect(inferA2AErrorHint("RemoteProtocolError (no message — likely connection reset)")).toMatch(/connection.*dropped/); + }); + + it("recognises agent-runtime exceptions", () => { + expect(inferA2AErrorHint("Agent error: ValueError raised")).toMatch(/runtime threw an exception/); + expect(inferA2AErrorHint("RuntimeException in tool call")).toMatch(/runtime threw an exception/); + }); + + it("recognises peer-unreachable cases (Activity-tab originals)", () => { + expect(inferA2AErrorHint("workspace not found")).toMatch(/can't be reached/); + expect(inferA2AErrorHint("not accessible")).toMatch(/can't be reached/); + expect(inferA2AErrorHint("workspace is offline")).toMatch(/can't be reached/); + }); + + it("returns the empty-detail-specific hint when input is exactly empty", () => { + expect(inferA2AErrorHint("")).toMatch(/no error detail/); + }); + + it("returns a generic fallback for unrecognised text", () => { + const hint = inferA2AErrorHint("some completely novel error nobody has matched yet"); + expect(hint).toMatch(/Check the workspace logs|delivery failure/); + }); + + it("Claude SDK wedge wins over the more general timeout pattern", () => { + // Both 'control request timeout' and 'timeout' match the same + // input. The SDK wedge hint is more actionable; the ordering in + // the function must keep it first. Lock that priority in. + const hint = inferA2AErrorHint("Control request timeout: initialize"); + expect(hint).toMatch(/Claude Code SDK/); + expect(hint).not.toMatch(/proxy timeout/); + }); +}); diff --git a/canvas/src/components/tabs/chat/a2aErrorHint.ts b/canvas/src/components/tabs/chat/a2aErrorHint.ts new file mode 100644 index 00000000..e29e643a --- /dev/null +++ b/canvas/src/components/tabs/chat/a2aErrorHint.ts @@ -0,0 +1,54 @@ +/** + * Maps an A2A delivery-failure detail string (the bit AFTER stripping + * the [A2A_ERROR] sentinel prefix) to a one-line operator-actionable + * hint. Pattern matches are lowercase substring checks, ordered most- + * specific first so the right hint wins when multiple patterns + * overlap (e.g. "control request timeout" wins over generic "timeout"). + * + * Used by both the chat Agent Comms panel and the Activity tab so the + * same symptom reads identically across surfaces. Two prior copies + * had already drifted (Activity tab gained `not found`/`offline` + * cases AgentCommsPanel never picked up) — this module is the merged + * superset and the only place hint text should change. + */ +export function inferA2AErrorHint(detail: string): string { + const t = detail.toLowerCase(); + + // "control request timeout" is the specific Claude Code SDK init + // wedge symptom. Pattern on the full phrase, not bare "initialize" + // — a user task containing "failed to initialize database" would + // false-positive into the SDK-wedge hint. + if (t.includes("control request timeout")) { + return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it."; + } + if ( + t.includes("readtimeout") || + t.includes("connecttimeout") || + t.includes("deadline exceeded") || + t.includes("timeout") + ) { + return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats."; + } + if ( + t.includes("connectionreset") || + t.includes("remoteprotocolerror") || + t.includes("connection reset") || + t.includes("no message") + ) { + return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip — retry once. If it repeats, the remote container may have crashed mid-request; check its logs."; + } + if (t.includes("agent error") || t.includes("exception")) { + return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes."; + } + if ( + t.includes("not found") || + t.includes("not accessible") || + t.includes("offline") + ) { + return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying."; + } + if (detail === "") { + return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move."; + } + return "The remote agent reported a delivery failure. Check the workspace logs or try restarting."; +} diff --git a/workspace/a2a_client.py b/workspace/a2a_client.py index 8893e2e7..d740bd6a 100644 --- a/workspace/a2a_client.py +++ b/workspace/a2a_client.py @@ -111,7 +111,12 @@ async def send_a2a_message(target_url: str, message: str) -> str: type_name = type(e).__name__ if not msg: detail = f"{type_name} (no message — likely connection reset or silent timeout)" - elif type_name in msg: + elif msg.startswith(f"{type_name}:") or msg.startswith(f"{type_name} "): + # Already prefixed with the type — don't double-prefix. + # Prefix-anchored check (not substring) so a message that + # happens to mention some OTHER class name mid-string + # (e.g. "got OSError on read") doesn't suppress our own + # type prefix and lose the diagnostic signal. detail = msg else: detail = f"{type_name}: {msg}" diff --git a/workspace/a2a_tools.py b/workspace/a2a_tools.py index 4834870b..3e3671bb 100644 --- a/workspace/a2a_tools.py +++ b/workspace/a2a_tools.py @@ -188,7 +188,14 @@ async def tool_delegate_task(workspace_id: str, task: str) -> str: # Strip the sentinel prefix so error_detail is the human-readable # cause directly. The Activity tab's red error chip surfaces this # without the user having to scroll into the raw response JSON. - error_detail = result[len(_A2A_ERROR_PREFIX):].strip() if is_error else "" + # + # Cap at 4096 chars before sending — the platform's + # activity_logs.error_detail column is unbounded TEXT and a + # malicious or buggy peer could otherwise stream an arbitrarily + # large error message into the caller's activity log. 4096 is + # comfortably above any real exception traceback we've seen and + # well below an obvious-DoS threshold. + error_detail = result[len(_A2A_ERROR_PREFIX):].strip()[:4096] if is_error else "" await report_activity( "a2a_receive", workspace_id, f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed: {error_detail[:120]}", diff --git a/workspace/tests/test_a2a_client.py b/workspace/tests/test_a2a_client.py index a9fb522a..fd813f3e 100644 --- a/workspace/tests/test_a2a_client.py +++ b/workspace/tests/test_a2a_client.py @@ -207,6 +207,24 @@ class TestSendA2AMessage: # Target URL is included so chained delegations are traceable. assert "target=http://target/a2a" in result + async def test_jsonrpc_error_with_code_zero_includes_code_in_detail(self): + """JSON-RPC error code=0 is technically not valid in the spec, + but a malformed peer can still send it — make sure the code is + preserved in the detail rather than collapsing into the + no-code path. Locks in the `code is not None` semantics over + the truthy-check shortcut.""" + import a2a_client + + resp = _make_response(200, {"error": {"code": 0, "message": "weird"}}) + mock_client = _make_mock_client(post_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result = await a2a_client.send_a2a_message("http://target/a2a", "task") + + assert result.startswith(a2a_client._A2A_ERROR_PREFIX) + assert "code=0" in result + assert "weird" in result + async def test_neither_result_nor_error_returns_a2a_error_with_payload(self): """Response with neither 'result' nor 'error' → A2A_ERROR + payload context.""" import a2a_client