From dd3d11c51da01fd74a6a0d352b052b30217f4475 Mon Sep 17 00:00:00 2001 From: hongming Date: Wed, 20 May 2026 03:23:47 -0700 Subject: [PATCH] fix(canvas/a2a-hint): route poll-mode budget timeout away from restart anti-pattern (P1 #348) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per `feedback_surface_actionable_failure_reason_to_user`, an opaque "workspace restart is the safe first move" prompt is the anti-pattern when the underlying error is a still-in-flight long-running task — which is exactly the canonical PM→Researcher codex coordination case that surfaces "FAILED TO DELIVER" today. Three improvements: 1. New `polling timeout after`/`check_task_status` pattern, ordered ABOVE the generic timeout bucket, routes to a hint that explicitly tells the user NOT to restart and to call check_task_status with the delegation_id to retrieve the in-flight result. 2. New optional `context.peerKind` parameter. When the caller knows the callee is a codex runtime, the empty-detail and generic-timeout hints specialise to call out that codex tasks can legitimately exceed the 600s sync-proxy budget. 3. The empty-detail branch (the most common "useless hint" path) no longer claims "restart is the safe first move" by default — it now tells the user to check the callee's Activity tab before restarting. Existing single-arg callers continue to compile unchanged (context is optional). Regression test count goes 9 → 17. Refs P1 #348, feedback_surface_actionable_failure_reason_to_user. --- .../tabs/chat/__tests__/a2aErrorHint.test.ts | 62 +++++++++++++++++++ .../src/components/tabs/chat/a2aErrorHint.ts | 47 +++++++++++++- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts b/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts index 8518829ba..ebc350712 100644 --- a/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts +++ b/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts @@ -64,4 +64,66 @@ describe("inferA2AErrorHint", () => { expect(hint).toMatch(/Claude Code SDK/); expect(hint).not.toMatch(/proxy timeout/); }); + + // ---- P1 #348: poll-mode timeout-class detection ---- + + it("routes poll-mode budget exhaustion to its specific actionable hint", () => { + // a2a_tools_delegation.py emits this exact shape after the 600s + // budget. The user must NOT be told to restart — the work is + // still in flight on the platform side. + const hint = inferA2AErrorHint( + "polling timeout after 600s (delegation_id=abc, last_status=processing); the platform is still working on it — call check_task_status('abc') to retrieve later", + ); + expect(hint).toMatch(/Do NOT restart/); + expect(hint).toMatch(/check_task_status/); + }); + + it("matches the check_task_status hint clue even without the 'polling timeout' phrase", () => { + const hint = inferA2AErrorHint( + "platform busy — call check_task_status('xyz')", + ); + expect(hint).toMatch(/check_task_status/); + }); + + it("poll-mode hint wins over the generic timeout bucket", () => { + // The string contains both "polling timeout after" and "timeout" + // — the more-specific poll-mode hint must win so users don't get + // the generic "restart" advice for a still-in-flight task. + const hint = inferA2AErrorHint("polling timeout after 600s ..."); + expect(hint).toMatch(/Do NOT restart/); + expect(hint).not.toMatch(/restart the workspace if this repeats/); + }); + + // ---- P1 #348: codex-aware specialization ---- + + it("specialises the empty-detail hint for codex callees", () => { + // Per feedback_surface_actionable_failure_reason_to_user: opaque + // restart prompts are the anti-pattern. With peerKind=codex the + // hint explicitly de-recommends restart. + const hint = inferA2AErrorHint("", { peerKind: "codex" }); + expect(hint).toMatch(/codex/); + expect(hint).toMatch(/check its Activity tab/i); + expect(hint).not.toMatch(/A workspace restart is the safe first move/); + }); + + it("specialises generic-timeout hint for codex callees", () => { + const hint = inferA2AErrorHint("ReadTimeout", { peerKind: "codex" }); + expect(hint).toMatch(/codex/); + expect(hint).toMatch(/600s/); + }); + + it("falls back to the non-codex generic timeout hint when no peerKind given", () => { + const hint = inferA2AErrorHint("ReadTimeout"); + expect(hint).toMatch(/proxy timeout/); + expect(hint).not.toMatch(/600s sync-proxy/); + }); + + it("preserves existing empty-detail wording when no peer context provided", () => { + const hint = inferA2AErrorHint(""); + expect(hint).toMatch(/no error detail/); + // Updated wording: must NOT be the bare "restart is the safe + // first move" line — that violates surface-actionable-reason. + expect(hint).not.toMatch(/safe first move/); + expect(hint).toMatch(/Activity tab/); + }); }); diff --git a/canvas/src/components/tabs/chat/a2aErrorHint.ts b/canvas/src/components/tabs/chat/a2aErrorHint.ts index e29e643a6..7096a2bc3 100644 --- a/canvas/src/components/tabs/chat/a2aErrorHint.ts +++ b/canvas/src/components/tabs/chat/a2aErrorHint.ts @@ -10,10 +10,37 @@ * had already drifted (Activity tab gained `not found`/`offline` * cases AgentCommsPanel never picked up) — this module is the merged * superset and the only place hint text should change. + * + * Optional `context.peerKind` lets callers signal "the callee was a + * codex-runtime task" so the timeout-class hints can be more specific + * about expected long completion times (PM-coordinating-Researcher is + * the canonical case where the 600s sync-proxy budget is too tight). */ -export function inferA2AErrorHint(detail: string): string { +export interface A2AErrorContext { + /** Runtime of the callee, when known. e.g. "codex", "claude-code". */ + peerKind?: string; +} + +export function inferA2AErrorHint( + detail: string, + context?: A2AErrorContext, +): string { const t = detail.toLowerCase(); + // Poll-mode budget exhaustion (a2a_tools_delegation.py emits + // "polling timeout after Ns ... call check_task_status(...) to + // retrieve later"). This is NOT a delivery failure — the work is + // still in flight on the platform side. Route to a specific hint + // BEFORE the generic timeout bucket so the user gets the actionable + // "wait + check_task_status" guidance instead of the misleading + // "restart the workspace" anti-pattern. + if ( + t.includes("polling timeout after") || + t.includes("call check_task_status") + ) { + return "The 600s sync-polling budget expired but the platform is still working on the delegation. Do NOT restart — the work isn't lost. Wait, then call check_task_status with the delegation_id to retrieve the result. If the callee is a long-running codex task, this is expected."; + } + // "control request timeout" is the specific Claude Code SDK init // wedge symptom. Pattern on the full phrase, not bare "initialize" // — a user task containing "failed to initialize database" would @@ -27,6 +54,13 @@ export function inferA2AErrorHint(detail: string): string { t.includes("deadline exceeded") || t.includes("timeout") ) { + // For codex callees, a 600s sync-proxy timeout is the EXPECTED + // shape when the task is genuinely long-running. Calling out the + // workspace-restart anti-pattern explicitly per + // `feedback_surface_actionable_failure_reason_to_user`. + if ((context?.peerKind || "").toLowerCase() === "codex") { + return "The codex remote agent didn't respond within the 600s sync-proxy timeout. Codex tasks can legitimately run longer than this — check the callee's Activity tab; the work may still be progressing. Restart only if the container is genuinely stuck (no activity for several minutes)."; + } return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats."; } if ( @@ -48,7 +82,16 @@ export function inferA2AErrorHint(detail: string): string { return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying."; } if (detail === "") { - return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move."; + // Per `feedback_surface_actionable_failure_reason_to_user`: a bare + // "restart the workspace" prompt is the anti-pattern when the + // underlying failure was a silent timeout against a long-running + // remote (codex Researcher being coordinated by PM is the + // canonical case). If the caller knows the peer is codex, route + // to the more specific hint that explicitly de-recommends restart. + if ((context?.peerKind || "").toLowerCase() === "codex") { + return "The codex remote agent returned no error detail — most often the 600s sync-proxy budget expired before the task finished. The work may still be progressing on the callee side; check its Activity tab before restarting."; + } + return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). Check the callee's Activity tab to see if work is still in flight before restarting."; } return "The remote agent reported a delivery failure. Check the workspace logs or try restarting."; } -- 2.52.0