fix(chat): client timeout is not "unreachable" — keep thinking state for long agent turns #2515

Merged
agent-reviewer merged 1 commit from fix/chat-timeout-not-unreachable into main 2026-06-10 08:38:35 +00:00
2 changed files with 105 additions and 1 deletion
@@ -0,0 +1,84 @@
// @vitest-environment jsdom
//
// jrs-auto, 2026-06-09 — "Failed to send message — agent may be unreachable"
// after 120s WHILE the agent visibly runs tools in the activity feed.
//
// Mechanism: the A2A proxy holds the POST open for the agent's whole turn;
// a long tool-calling turn outlives the 120s client budget and
// AbortSignal.timeout fires (DOMException name="TimeoutError"). The message
// WAS delivered — the timeout is a client-side stop-waiting, not a transport
// failure. Pre-fix, the catch-all released the guards and showed the
// unreachable banner (false alarm). Post-fix: a TimeoutError keeps the
// thinking state (reply + guard release arrive via the AGENT_MESSAGE WS
// event, the documented poll-mode contract); real transport errors keep
// the failure banner.
import { describe, it, expect, vi, beforeEach } from "vitest";
import { renderHook, act } from "@testing-library/react";
// Controllable stand-in for api.post. Each test below queues the single
// rejection it wants the send path to observe (mockRejectedValueOnce).
const apiPostMock = vi.fn<
(url: string, body?: unknown, opts?: unknown) => Promise<unknown>
>();
// Replace the API client module: `post` forwards its three arguments to
// apiPostMock unchanged, `get` is an inert mock.
// NOTE(review): `post` is a forwarding arrow rather than `post: apiPostMock`,
// presumably so apiPostMock is looked up at call time (vitest hoists vi.mock
// above this const declaration) — confirm before simplifying.
vi.mock("@/lib/api", () => ({
api: {
post: (url: string, body?: unknown, opts?: unknown) =>
apiPostMock(url, body, opts),
get: vi.fn(),
},
}));
// Replace the uploads module with inert stand-ins: uploadChatFiles is a bare
// mock and FileTooLargeError an empty Error subclass. Neither is exercised by
// the tests below.
vi.mock("../../uploads", () => ({
uploadChatFiles: vi.fn(),
FileTooLargeError: class FileTooLargeError extends Error {},
}));
import { useChatSend } from "../useChatSend";
// Builds the rejection value the tests feed to the mocked api.post:
// AbortSignal.timeout rejects with a DOMException whose name is "TimeoutError".
const timeoutError = () => {
  const message = "signal timed out";
  const name = "TimeoutError";
  try {
    return new DOMException(message, name);
  } catch {
    // jsdom fallback: if DOMException cannot be constructed, a plain Error
    // carrying the same .name is enough — only the .name contract matters.
    return Object.assign(new Error(message), { name });
  }
};
// Reset the shared api.post mock ahead of every test case.
beforeEach(function resetApiPostMock() {
  apiPostMock.mockReset();
});
// Regression suite: a rejected api.post must be classified by the error's
// name. A "TimeoutError" leaves the hook in its sending state with no error;
// any other rejection surfaces the "unreachable" error and clears sending.
describe("useChatSend — client timeout is NOT 'unreachable'", () => {
// Case 1: the POST rejects with a TimeoutError-named value.
it("keeps sending=true and shows NO error when the 120s client timeout fires (delivered, agent still working)", async () => {
// Queue exactly one rejection for the next api.post call.
apiPostMock.mockRejectedValueOnce(timeoutError());
const { result } = renderHook(() =>
useChatSend("ws-long-turn", { getHistoryMessages: () => [] }),
);
await act(async () => {
await result.current.sendMessage("do a long multi-tool task");
// Yield one more microtask turn inside act — presumably so the hook's
// rejection handler has run before the assertions; keep this ordering.
await Promise.resolve();
});
expect(result.current.error).toBeNull(); // no false "unreachable" banner
expect(result.current.sending).toBe(true); // thinking persists until the WS reply
});
// Case 2: the POST rejects with an ordinary Error (name is not "TimeoutError").
it("still fails loudly on a REAL transport error (non-timeout rejection)", async () => {
// Queue exactly one rejection for the next api.post call.
apiPostMock.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));
const { result } = renderHook(() =>
useChatSend("ws-dead", { getHistoryMessages: () => [] }),
);
await act(async () => {
await result.current.sendMessage("hello?");
// Same extra microtask turn as in the timeout case above.
await Promise.resolve();
});
// The error text must mention "unreachable".
expect(result.current.error).toMatch(/unreachable/);
expect(result.current.sending).toBe(false); // guards released for retry
});
});
@@ -252,12 +252,32 @@ export function useChatSend(workspaceId: string, options: UseChatSendOptions) {
}
releaseSendGuards();
})
.catch(() => {
.catch((e: unknown) => {
if (sendTokenRef.current !== myToken) return;
if (!sendingFromAPIRef.current) {
sendInFlightRef.current = false;
return;
}
// CLIENT TIMEOUT ≠ UNREACHABLE (jrs-auto, 2026-06-09). The A2A
// proxy holds this POST open for the agent's WHOLE turn; a long
// tool-calling turn routinely outlives the 120s client budget.
// AbortSignal.timeout firing after the server ACCEPTED and held
// the connection means the message was DELIVERED and the agent is
// still working — showing "agent may be unreachable" here is a
// false alarm (the user watches the agent run tools in the
// activity feed while the chat claims failure). Keep the thinking
// state up; the reply lands via the AGENT_MESSAGE WebSocket event,
// which releases the guards — exactly the documented poll-mode
// contract above. Genuine unreachability fails FAST (connection
// refused / 4xx / 5xx) and still takes the error branch; a truly
// dead agent is surfaced by the reactive-health path
// (maybeMarkContainerDead), not by this client timeout.
const isClientTimeout =
e !== null && typeof e === "object" &&
"name" in e && (e as { name: unknown }).name === "TimeoutError";
if (isClientTimeout) {
return; // delivered; reply (and guard release) arrives via WS
}
releaseSendGuards();
setError("Failed to send message — agent may be unreachable");
});