From bcf7022d92f4bb562599738348c8b31a92b57106 Mon Sep 17 00:00:00 2001
From: core-devops
Date: Tue, 9 Jun 2026 23:53:26 -0700
Subject: [PATCH] =?UTF-8?q?fix(chat):=20client=20timeout=20is=20not=20"unr?=
 =?UTF-8?q?eachable"=20=E2=80=94=20keep=20the=20thinking=20state=20for=20l?=
 =?UTF-8?q?ong=20agent=20turns?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

jrs-auto, 2026-06-09: the chat showed "Failed to send message — agent
may be unreachable" after 120s WHILE the agent visibly ran tools in the
activity feed.

Mechanism: the A2A proxy holds the send POST open for the agent's whole
turn; a long tool-calling turn outlives the 120s client budget,
AbortSignal.timeout fires (DOMException name=TimeoutError), and the
catch-all released the guards + showed the unreachable banner — a false
alarm on a message that WAS delivered and was still being processed.

The catch now classifies: TimeoutError → delivered + still working: keep
the thinking state (no banner, guards stay up; the reply and the guard
release arrive via the AGENT_MESSAGE WebSocket event — the documented
poll-mode contract). Real transport errors (fast connection-refused /
4xx/5xx) keep the loud failure + guard release for retry. A truly dead
agent is surfaced by the reactive-health path, not this client timeout.

Tests: TimeoutError → no error + sending stays true; ECONNREFUSED →
"unreachable" + guards released. Full chat-hooks suite: 296 passed.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .../useChatSend.clientTimeout.test.tsx        | 84 +++++++++++++++++++
 .../components/tabs/chat/hooks/useChatSend.ts | 22 ++++-
 2 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 canvas/src/components/tabs/chat/hooks/__tests__/useChatSend.clientTimeout.test.tsx

diff --git a/canvas/src/components/tabs/chat/hooks/__tests__/useChatSend.clientTimeout.test.tsx b/canvas/src/components/tabs/chat/hooks/__tests__/useChatSend.clientTimeout.test.tsx
new file mode 100644
index 000000000..47de56621
--- /dev/null
+++ b/canvas/src/components/tabs/chat/hooks/__tests__/useChatSend.clientTimeout.test.tsx
@@ -0,0 +1,84 @@
+// @vitest-environment jsdom
+//
+// jrs-auto, 2026-06-09 — "Failed to send message — agent may be unreachable"
+// after 120s WHILE the agent visibly runs tools in the activity feed.
+//
+// Mechanism: the A2A proxy holds the POST open for the agent's whole turn;
+// a long tool-calling turn outlives the 120s client budget and
+// AbortSignal.timeout fires (DOMException name="TimeoutError"). The message
+// WAS delivered — the timeout is a client-side stop-waiting, not transport
+// failure. Pre-fix the catch-all released the guards and showed the
+// unreachable banner (false alarm). Post-fix: a TimeoutError keeps the
+// thinking state (reply + guard release arrive via the AGENT_MESSAGE WS
+// event, the documented poll-mode contract); real transport errors keep
+// the failure banner.
+
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { renderHook, act } from "@testing-library/react";
+
+const apiPostMock = vi.fn<
+  (url: string, body?: unknown, opts?: unknown) => Promise<unknown>
+>();
+vi.mock("@/lib/api", () => ({
+  api: {
+    post: (url: string, body?: unknown, opts?: unknown) =>
+      apiPostMock(url, body, opts),
+    get: vi.fn(),
+  },
+}));
+vi.mock("../../uploads", () => ({
+  uploadChatFiles: vi.fn(),
+  FileTooLargeError: class FileTooLargeError extends Error {},
+}));
+
+import { useChatSend } from "../useChatSend";
+
+// AbortSignal.timeout rejects with a DOMException named "TimeoutError".
+const timeoutError = () => {
+  try {
+    return new DOMException("signal timed out", "TimeoutError");
+  } catch {
+    // jsdom fallback — only the .name contract matters.
+    const e = new Error("signal timed out");
+    (e as Error & { name: string }).name = "TimeoutError";
+    return e;
+  }
+};
+
+beforeEach(() => {
+  apiPostMock.mockReset();
+});
+
+describe("useChatSend — client timeout is NOT 'unreachable'", () => {
+  it("keeps sending=true and shows NO error when the 120s client timeout fires (delivered, agent still working)", async () => {
+    apiPostMock.mockRejectedValueOnce(timeoutError());
+
+    const { result } = renderHook(() =>
+      useChatSend("ws-long-turn", { getHistoryMessages: () => [] }),
+    );
+
+    await act(async () => {
+      await result.current.sendMessage("do a long multi-tool task");
+      await Promise.resolve();
+    });
+
+    expect(result.current.error).toBeNull(); // no false "unreachable" banner
+    expect(result.current.sending).toBe(true); // thinking persists until the WS reply
+  });
+
+  it("still fails loudly on a REAL transport error (non-timeout rejection)", async () => {
+    apiPostMock.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));
+
+    const { result } = renderHook(() =>
+      useChatSend("ws-dead", { getHistoryMessages: () => [] }),
+    );
+
+    await act(async () => {
+      await result.current.sendMessage("hello?");
+      await Promise.resolve();
+    });
+
+    expect(result.current.error).toMatch(/unreachable/);
+    expect(result.current.sending).toBe(false); // guards released for retry
+  });
+});
diff --git a/canvas/src/components/tabs/chat/hooks/useChatSend.ts b/canvas/src/components/tabs/chat/hooks/useChatSend.ts
index beb9724c0..6db570a80 100644
--- a/canvas/src/components/tabs/chat/hooks/useChatSend.ts
+++ b/canvas/src/components/tabs/chat/hooks/useChatSend.ts
@@ -252,12 +252,32 @@ export function useChatSend(workspaceId: string, options: UseChatSendOptions) {
           }
           releaseSendGuards();
         })
-        .catch(() => {
+        .catch((e: unknown) => {
           if (sendTokenRef.current !== myToken) return;
           if (!sendingFromAPIRef.current) {
             sendInFlightRef.current = false;
             return;
           }
+          // CLIENT TIMEOUT ≠ UNREACHABLE (jrs-auto, 2026-06-09). The A2A
+          // proxy holds this POST open for the agent's WHOLE turn; a long
+          // tool-calling turn routinely outlives the 120s client budget.
+          // AbortSignal.timeout firing after the server ACCEPTED and held
+          // the connection means the message was DELIVERED and the agent is
+          // still working — showing "agent may be unreachable" here is a
+          // false alarm (the user watches the agent run tools in the
+          // activity feed while the chat claims failure). Keep the thinking
+          // state up; the reply lands via the AGENT_MESSAGE WebSocket event,
+          // which releases the guards — exactly the documented poll-mode
+          // contract above. Genuine unreachability fails FAST (connection
+          // refused / 4xx / 5xx) and still takes the error branch; a truly
+          // dead agent is surfaced by the reactive-health path
+          // (maybeMarkContainerDead), not by this client timeout.
+          const isClientTimeout =
+            e !== null && typeof e === "object" &&
+            "name" in e && (e as { name: unknown }).name === "TimeoutError";
+          if (isClientTimeout) {
+            return; // delivered; reply (and guard release) arrives via WS
+          }
           releaseSendGuards();
           setError("Failed to send message — agent may be unreachable");
         });
-- 
2.52.0