fix(canvas/a2a-hint): route poll-mode budget timeout away from restart anti-pattern (P1 #348) #1607
@@ -64,4 +64,66 @@ describe("inferA2AErrorHint", () => {
|
||||
expect(hint).toMatch(/Claude Code SDK/);
|
||||
expect(hint).not.toMatch(/proxy timeout/);
|
||||
});
|
||||
|
||||
// ---- P1 #348: poll-mode timeout-class detection ----
|
||||
|
||||
it("routes poll-mode budget exhaustion to its specific actionable hint", () => {
|
||||
// a2a_tools_delegation.py emits this exact shape after the 600s
|
||||
// budget. The user must NOT be told to restart — the work is
|
||||
// still in flight on the platform side.
|
||||
const hint = inferA2AErrorHint(
|
||||
"polling timeout after 600s (delegation_id=abc, last_status=processing); the platform is still working on it — call check_task_status('abc') to retrieve later",
|
||||
);
|
||||
expect(hint).toMatch(/Do NOT restart/);
|
||||
expect(hint).toMatch(/check_task_status/);
|
||||
});
|
||||
|
||||
it("matches the check_task_status hint clue even without the 'polling timeout' phrase", () => {
|
||||
const hint = inferA2AErrorHint(
|
||||
"platform busy — call check_task_status('xyz')",
|
||||
);
|
||||
expect(hint).toMatch(/check_task_status/);
|
||||
});
|
||||
|
||||
it("poll-mode hint wins over the generic timeout bucket", () => {
|
||||
// The string contains both "polling timeout after" and "timeout"
|
||||
// — the more-specific poll-mode hint must win so users don't get
|
||||
// the generic "restart" advice for a still-in-flight task.
|
||||
const hint = inferA2AErrorHint("polling timeout after 600s ...");
|
||||
expect(hint).toMatch(/Do NOT restart/);
|
||||
expect(hint).not.toMatch(/restart the workspace if this repeats/);
|
||||
});
|
||||
|
||||
// ---- P1 #348: codex-aware specialization ----
|
||||
|
||||
it("specialises the empty-detail hint for codex callees", () => {
|
||||
// Per feedback_surface_actionable_failure_reason_to_user: opaque
|
||||
// restart prompts are the anti-pattern. With peerKind=codex the
|
||||
// hint explicitly de-recommends restart.
|
||||
const hint = inferA2AErrorHint("", { peerKind: "codex" });
|
||||
expect(hint).toMatch(/codex/);
|
||||
expect(hint).toMatch(/check its Activity tab/i);
|
||||
expect(hint).not.toMatch(/A workspace restart is the safe first move/);
|
||||
});
|
||||
|
||||
it("specialises generic-timeout hint for codex callees", () => {
|
||||
const hint = inferA2AErrorHint("ReadTimeout", { peerKind: "codex" });
|
||||
expect(hint).toMatch(/codex/);
|
||||
expect(hint).toMatch(/600s/);
|
||||
});
|
||||
|
||||
it("falls back to the non-codex generic timeout hint when no peerKind given", () => {
|
||||
const hint = inferA2AErrorHint("ReadTimeout");
|
||||
expect(hint).toMatch(/proxy timeout/);
|
||||
expect(hint).not.toMatch(/600s sync-proxy/);
|
||||
});
|
||||
|
||||
it("preserves existing empty-detail wording when no peer context provided", () => {
|
||||
const hint = inferA2AErrorHint("");
|
||||
expect(hint).toMatch(/no error detail/);
|
||||
// Updated wording: must NOT be the bare "restart is the safe
|
||||
// first move" line — that violates surface-actionable-reason.
|
||||
expect(hint).not.toMatch(/safe first move/);
|
||||
expect(hint).toMatch(/Activity tab/);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -10,10 +10,37 @@
|
||||
* had already drifted (Activity tab gained `not found`/`offline`
|
||||
* cases AgentCommsPanel never picked up) — this module is the merged
|
||||
* superset and the only place hint text should change.
|
||||
*
|
||||
* Optional `context.peerKind` lets callers signal "the callee was a
|
||||
* codex-runtime task" so the timeout-class hints can be more specific
|
||||
* about expected long completion times (PM-coordinating-Researcher is
|
||||
* the canonical case where the 600s sync-proxy budget is too tight).
|
||||
*/
|
||||
export function inferA2AErrorHint(detail: string): string {
|
||||
export interface A2AErrorContext {
|
||||
/** Runtime of the callee, when known. e.g. "codex", "claude-code". */
|
||||
peerKind?: string;
|
||||
}
|
||||
|
||||
export function inferA2AErrorHint(
|
||||
detail: string,
|
||||
context?: A2AErrorContext,
|
||||
): string {
|
||||
const t = detail.toLowerCase();
|
||||
|
||||
// Poll-mode budget exhaustion (a2a_tools_delegation.py emits
|
||||
// "polling timeout after Ns ... call check_task_status(...) to
|
||||
// retrieve later"). This is NOT a delivery failure — the work is
|
||||
// still in flight on the platform side. Route to a specific hint
|
||||
// BEFORE the generic timeout bucket so the user gets the actionable
|
||||
// "wait + check_task_status" guidance instead of the misleading
|
||||
// "restart the workspace" anti-pattern.
|
||||
if (
|
||||
t.includes("polling timeout after") ||
|
||||
t.includes("call check_task_status")
|
||||
) {
|
||||
return "The 600s sync-polling budget expired but the platform is still working on the delegation. Do NOT restart — the work isn't lost. Wait, then call check_task_status with the delegation_id to retrieve the result. If the callee is a long-running codex task, this is expected.";
|
||||
}
|
||||
|
||||
// "control request timeout" is the specific Claude Code SDK init
|
||||
// wedge symptom. Pattern on the full phrase, not bare "initialize"
|
||||
// — a user task containing "failed to initialize database" would
|
||||
@@ -27,6 +54,13 @@ export function inferA2AErrorHint(detail: string): string {
|
||||
t.includes("deadline exceeded") ||
|
||||
t.includes("timeout")
|
||||
) {
|
||||
// For codex callees, a 600s sync-proxy timeout is the EXPECTED
|
||||
// shape when the task is genuinely long-running. Calling out the
|
||||
// workspace-restart anti-pattern explicitly per
|
||||
// `feedback_surface_actionable_failure_reason_to_user`.
|
||||
if ((context?.peerKind || "").toLowerCase() === "codex") {
|
||||
return "The codex remote agent didn't respond within the 600s sync-proxy timeout. Codex tasks can legitimately run longer than this — check the callee's Activity tab; the work may still be progressing. Restart only if the container is genuinely stuck (no activity for several minutes).";
|
||||
}
|
||||
return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats.";
|
||||
}
|
||||
if (
|
||||
@@ -48,7 +82,16 @@ export function inferA2AErrorHint(detail: string): string {
|
||||
return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying.";
|
||||
}
|
||||
if (detail === "") {
|
||||
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
|
||||
// Per `feedback_surface_actionable_failure_reason_to_user`: a bare
|
||||
// "restart the workspace" prompt is the anti-pattern when the
|
||||
// underlying failure was a silent timeout against a long-running
|
||||
// remote (codex Researcher being coordinated by PM is the
|
||||
// canonical case). If the caller knows the peer is codex, route
|
||||
// to the more specific hint that explicitly de-recommends restart.
|
||||
if ((context?.peerKind || "").toLowerCase() === "codex") {
|
||||
return "The codex remote agent returned no error detail — most often the 600s sync-proxy budget expired before the task finished. The work may still be progressing on the callee side; check its Activity tab before restarting.";
|
||||
}
|
||||
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). Check the callee's Activity tab to see if work is still in flight before restarting.";
|
||||
}
|
||||
return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user