fix(a2a): review-driven hardening — prefix-anchored type check, error_detail cap, shared hint module

Three required fixes from the bundle review of 391e1872:

1. workspace/a2a_client.py: substring `type_name in msg` could miss
   the diagnostic prefix when an exception's message embedded a
   different class name mid-string (e.g. `OSError("see ConnectionError
   below")` → printed as plain msg, type lost). Switched to a
   prefix-anchored check (`msg.startswith(f"{type_name}:")` etc.) so
   the type label is always added when not already at the start of
   the message.

2. workspace/a2a_tools.py: `activity_logs.error_detail` is unbounded
   TEXT on the platform (handlers/activity.go does not validate
   length). A buggy or hostile peer could stream arbitrarily large
   error messages into the caller's activity log. Cap at 4096 chars
   at the producer — comfortably above any real exception traceback,
   well below an obvious-DoS threshold.

3. New regression test for JSON-RPC `code=0` — pins the
   `code is not None` semantics so the code is preserved in the
   detail rather than collapsing into the no-code path. Code=0 is
   not valid per the spec, but a malformed peer can still emit it
   and we want it visible for diagnosis.

Plus one optional taken: extracted the A2A-error → hint mapping into
canvas/src/components/tabs/chat/a2aErrorHint.ts. The two prior copies
(AgentCommsPanel.inferCauseHint + ActivityTab.inferA2AErrorHint) had
already drifted — Activity tab gained `not found`/`offline` cases the
chat panel never picked up, AgentCommsPanel handled empty-input
explicitly while Activity didn't. The shared module is the merged
superset, with 10 unit tests pinning each named pattern + the
"most specific first" ordering (Claude SDK wedge wins over generic
timeout).

Skipped (per analysis):
- Unicode-naive 120-char slice — Python str[:N] slices on code
  points, not bytes. Safe.
- Nested [A2A_ERROR] confusion — non-issue per reviewer; outer
  prefix winning still produces a structured render.
- MessagePreview + JsonBlock dual render on errors — intentional
  drilldown; raw JSON is below the fold for operators who need it.
- console.warn dedup — refetches don't happen per-event so spam
  risk is low.
- str(data)[:200] materialization — A2A response bodies aren't
  typically MB-sized.

Verified: 1005 canvas tests pass (10 new hint tests); 10 Python
send_a2a_message tests pass (1 new for code=0); tsc clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-24 23:47:44 -07:00
parent 391e187281
commit c159d85eb5
7 changed files with 158 additions and 49 deletions

View File

@ -5,6 +5,7 @@ import { api } from "@/lib/api";
import { ConversationTraceModal } from "@/components/ConversationTraceModal";
import { type ActivityEntry } from "@/types/activity";
import { useWorkspaceName } from "@/hooks/useWorkspaceName";
import { inferA2AErrorHint } from "./chat/a2aErrorHint";
interface Props {
workspaceId: string;
@ -288,29 +289,6 @@ function ActivityRow({
const A2A_ERROR_PREFIX = "[A2A_ERROR]";
/** Best-effort cause hint for an A2A delivery failure. Mirrors the
* patterns used by AgentCommsPanel's inferCauseHint so the same
* symptom reads the same way in both surfaces. */
function inferA2AErrorHint(detail: string): string {
const t = detail.toLowerCase();
if (t.includes("control request timeout")) {
return "The remote agent's Claude Code SDK is wedged on initialization. Restart the workspace to clear it.";
}
if (t.includes("readtimeout") || t.includes("connecttimeout") || t.includes("deadline exceeded") || t.includes("timeout")) {
return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task or its runtime is stuck — restart if this repeats.";
}
if (t.includes("connectionreset") || t.includes("remoteprotocolerror") || t.includes("connection reset") || t.includes("no message")) {
return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip; retry once. If it repeats, the remote container may have crashed mid-request — check its logs.";
}
if (t.includes("agent error") || t.includes("exception")) {
return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
}
if (t.includes("not found") || t.includes("not accessible") || t.includes("offline")) {
return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying.";
}
return "Check the remote agent's container logs for context. Restart the workspace if the failure repeats.";
}
/** Render a [A2A_ERROR]-prefixed response as a structured error block
* with a stripped detail line + a cause hint. The previous raw render
* ("[A2A_ERROR] " literal in the response area) gave the user no

View File

@ -9,6 +9,7 @@ import { WS_URL } from "@/store/socket";
import { closeWebSocketGracefully } from "@/lib/ws-close";
import { showToast } from "../../Toaster";
import { extractResponseText, extractRequestText } from "./message-parser";
import { inferA2AErrorHint } from "./a2aErrorHint";
export interface ActivityEntry {
id: string;
@ -115,29 +116,8 @@ function unwrapErrorText(raw: string | null): string {
return trimmed;
}
/** Best-effort cause hint based on what we can see in the error text.
* These map known runtime symptoms to operator-actionable language so
* the user isn't left staring at "[A2A_ERROR]" with no next step. */
function inferCauseHint(errorText: string): string {
const t = errorText.toLowerCase();
// "control request timeout" is the specific Claude Code SDK init
// wedge symptom. Don't pattern on bare "initialize" — too broad
// (a user task containing "failed to initialize database" would
// false-positive into the SDK-wedge hint).
if (t.includes("control request timeout")) {
return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it.";
}
if (t.includes("deadline exceeded") || t.includes("timeout")) {
return "The remote agent didn't respond within the proxy timeout. It may be busy with a long-running task, or the runtime is stuck. Restart the workspace if this repeats.";
}
if (t.includes("agent error") || t.includes("exception")) {
return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
}
if (errorText === "") {
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
}
return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
}
// inferA2AErrorHint moved to ./a2aErrorHint so the Activity tab and
// this panel render identical hints for the same symptom.
export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
const [messages, setMessages] = useState<CommMessage[]>([]);
@ -314,7 +294,7 @@ function ErrorMessage({ msg }: { msg: CommMessage }) {
const selectNode = useCanvasStore((s) => s.selectNode);
const [restarting, setRestarting] = useState(false);
const errorText = unwrapErrorText(msg.responseText);
const hint = inferCauseHint(errorText);
const hint = inferA2AErrorHint(errorText);
// Guard against acting on a peer whose workspace has been deleted
// since this row was logged. Without the guard, restart 404s

View File

@ -0,0 +1,67 @@
import { describe, it, expect } from "vitest";
import { inferA2AErrorHint } from "../a2aErrorHint";
// Pure logic. Pin every named pattern so a future contributor adding a
// new symptom doesn't accidentally collapse the buckets — and so the
// "most specific first" ordering can't drift without a test failing.
describe("inferA2AErrorHint", () => {
it("matches the Claude Code SDK init wedge specifically", () => {
const hint = inferA2AErrorHint("Control request timeout: initialize");
expect(hint).toMatch(/Claude Code SDK is wedged/);
});
it("does NOT misfire on user tasks containing 'initialize' generally", () => {
// Regression: an earlier bare-`initialize` pattern would have
// false-positived "failed to initialize database" into the SDK
// wedge hint. Confirm the full-phrase guard holds.
const hint = inferA2AErrorHint("failed to initialize database connection");
expect(hint).not.toMatch(/Claude Code SDK/);
});
it("recognises httpx ReadTimeout / ConnectTimeout class names", () => {
expect(inferA2AErrorHint("ReadTimeout: timeout")).toMatch(/proxy timeout/);
expect(inferA2AErrorHint("ConnectTimeout: ...")).toMatch(/proxy timeout/);
});
it("recognises generic timeout / deadline-exceeded language", () => {
expect(inferA2AErrorHint("deadline exceeded after 300s")).toMatch(/proxy timeout/);
expect(inferA2AErrorHint("Operation timeout")).toMatch(/proxy timeout/);
});
it("handles connection-reset family (RemoteProtocolError, ConnectionReset, no-message)", () => {
expect(inferA2AErrorHint("RemoteProtocolError: ...")).toMatch(/connection.*dropped/);
expect(inferA2AErrorHint("ConnectionResetError")).toMatch(/connection.*dropped/);
expect(inferA2AErrorHint("connection reset by peer")).toMatch(/connection.*dropped/);
expect(inferA2AErrorHint("RemoteProtocolError (no message — likely connection reset)")).toMatch(/connection.*dropped/);
});
it("recognises agent-runtime exceptions", () => {
expect(inferA2AErrorHint("Agent error: ValueError raised")).toMatch(/runtime threw an exception/);
expect(inferA2AErrorHint("RuntimeException in tool call")).toMatch(/runtime threw an exception/);
});
it("recognises peer-unreachable cases (Activity-tab originals)", () => {
expect(inferA2AErrorHint("workspace not found")).toMatch(/can't be reached/);
expect(inferA2AErrorHint("not accessible")).toMatch(/can't be reached/);
expect(inferA2AErrorHint("workspace is offline")).toMatch(/can't be reached/);
});
it("returns the empty-detail-specific hint when input is exactly empty", () => {
expect(inferA2AErrorHint("")).toMatch(/no error detail/);
});
it("returns a generic fallback for unrecognised text", () => {
const hint = inferA2AErrorHint("some completely novel error nobody has matched yet");
expect(hint).toMatch(/Check the workspace logs|delivery failure/);
});
it("Claude SDK wedge wins over the more general timeout pattern", () => {
// Both 'control request timeout' and 'timeout' match the same
// input. The SDK wedge hint is more actionable; the ordering in
// the function must keep it first. Lock that priority in.
const hint = inferA2AErrorHint("Control request timeout: initialize");
expect(hint).toMatch(/Claude Code SDK/);
expect(hint).not.toMatch(/proxy timeout/);
});
});

View File

@ -0,0 +1,54 @@
/**
* Maps an A2A delivery-failure detail string (the bit AFTER stripping
* the [A2A_ERROR] sentinel prefix) to a one-line operator-actionable
* hint. Pattern matches are lowercase substring checks, ordered most-
* specific first so the right hint wins when multiple patterns
* overlap (e.g. "control request timeout" wins over generic "timeout").
*
* Used by both the chat Agent Comms panel and the Activity tab so the
* same symptom reads identically across surfaces. Two prior copies
* had already drifted (Activity tab gained `not found`/`offline`
* cases AgentCommsPanel never picked up) this module is the merged
* superset and the only place hint text should change.
*/
export function inferA2AErrorHint(detail: string): string {
const t = detail.toLowerCase();
// "control request timeout" is the specific Claude Code SDK init
// wedge symptom. Pattern on the full phrase, not bare "initialize"
// — a user task containing "failed to initialize database" would
// false-positive into the SDK-wedge hint.
if (t.includes("control request timeout")) {
return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it.";
}
if (
t.includes("readtimeout") ||
t.includes("connecttimeout") ||
t.includes("deadline exceeded") ||
t.includes("timeout")
) {
return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats.";
}
if (
t.includes("connectionreset") ||
t.includes("remoteprotocolerror") ||
t.includes("connection reset") ||
t.includes("no message")
) {
return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip — retry once. If it repeats, the remote container may have crashed mid-request; check its logs.";
}
if (t.includes("agent error") || t.includes("exception")) {
return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
}
if (
t.includes("not found") ||
t.includes("not accessible") ||
t.includes("offline")
) {
return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying.";
}
if (detail === "") {
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
}
return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
}

View File

@ -111,7 +111,12 @@ async def send_a2a_message(target_url: str, message: str) -> str:
type_name = type(e).__name__
if not msg:
detail = f"{type_name} (no message — likely connection reset or silent timeout)"
elif type_name in msg:
elif msg.startswith(f"{type_name}:") or msg.startswith(f"{type_name} "):
# Already prefixed with the type — don't double-prefix.
# Prefix-anchored check (not substring) so a message that
# happens to mention some OTHER class name mid-string
# (e.g. "got OSError on read") doesn't suppress our own
# type prefix and lose the diagnostic signal.
detail = msg
else:
detail = f"{type_name}: {msg}"

View File

@ -188,7 +188,14 @@ async def tool_delegate_task(workspace_id: str, task: str) -> str:
# Strip the sentinel prefix so error_detail is the human-readable
# cause directly. The Activity tab's red error chip surfaces this
# without the user having to scroll into the raw response JSON.
error_detail = result[len(_A2A_ERROR_PREFIX):].strip() if is_error else ""
#
# Cap at 4096 chars before sending — the platform's
# activity_logs.error_detail column is unbounded TEXT and a
# malicious or buggy peer could otherwise stream an arbitrarily
# large error message into the caller's activity log. 4096 is
# comfortably above any real exception traceback we've seen and
# well below an obvious-DoS threshold.
error_detail = result[len(_A2A_ERROR_PREFIX):].strip()[:4096] if is_error else ""
await report_activity(
"a2a_receive", workspace_id,
f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed: {error_detail[:120]}",

View File

@ -207,6 +207,24 @@ class TestSendA2AMessage:
# Target URL is included so chained delegations are traceable.
assert "target=http://target/a2a" in result
async def test_jsonrpc_error_with_code_zero_includes_code_in_detail(self):
"""JSON-RPC error code=0 is technically not valid in the spec,
but a malformed peer can still send it make sure the code is
preserved in the detail rather than collapsing into the
no-code path. Locks in the `code is not None` semantics over
the truthy-check shortcut."""
import a2a_client
resp = _make_response(200, {"error": {"code": 0, "message": "weird"}})
mock_client = _make_mock_client(post_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result = await a2a_client.send_a2a_message("http://target/a2a", "task")
assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
assert "code=0" in result
assert "weird" in result
async def test_neither_result_nor_error_returns_a2a_error_with_payload(self):
"""Response with neither 'result' nor 'error' → A2A_ERROR + payload context."""
import a2a_client