From 31fedd50afc0be00e4b17d910f6cfaa61b659101 Mon Sep 17 00:00:00 2001 From: core-fe Date: Sun, 17 May 2026 10:39:13 -0700 Subject: [PATCH] =?UTF-8?q?fix(canvas):=20mobile=20chat=20realtime=20?= =?UTF-8?q?=E2=80=94=20WS=20wake-recovery=20+=20resume=20back-fill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: agent chat replies arrive ONLY via live AGENT_MESSAGE / A2A_RESPONSE WebSocket events (store/socket.ts ReconnectingSocket). The socket had NO visibilitychange/pageshow/online/focus recovery anywhere — reconnect was driven solely by ws.onclose. iOS Safari / Chrome-mobile freeze the page and silently tear the WS down WITHOUT firing onclose when backgrounded / device-locked; on thaw nothing re-arms (onclose never ran, interval timers were suspended), so the socket is dead and every reply during suspension is lost. store.rehydrate() only re-pulls /workspaces status, not chat, and useChatHistory fetches DB history mount-only — so missed replies never back-fill until remount (navigate away+back) or hard refresh. Desktop never hits this (its onclose fires promptly). Not forked code: MobileChat and desktop ChatTab share useChatSocket / useChatHistory; the divergence is purely mobile browser lifecycle. Fix (shared by desktop + mobile, restores the unified path): - ReconnectingSocket installs visibilitychange/pageshow/online/focus listeners that force a reconnect when the page is foregrounded and the socket isn't OPEN/CONNECTING (no-ops on desktop where onclose already handled it; SSR-safe via typeof guards). - On an open that follows a real loss, emit a socket-resume signal. - useChatHistory subscribes to resume and re-runs loadInitial(), so chat threads back-fill the persisted messages missed while frozen — exactly what a remount does today, automatically. Verification: code analysis + 9 new unit tests (socket.test.ts) simulating background-suspend + visibility/pageshow/online/focus transitions, asserting reconnect + resume emission + listener teardown; full canvas suite green (3315 passed, 0 fail); npm run build green. Mobile real-device confirmation still needed (cannot be proven without a physical device) — see PR body. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tabs/chat/hooks/useChatHistory.ts | 18 ++ canvas/src/store/__tests__/socket.test.ts | 199 ++++++++++++++++++ canvas/src/store/socket-events.ts | 50 +++++ canvas/src/store/socket.ts | 118 ++++++++++- 4 files changed, 384 insertions(+), 1 deletion(-) diff --git a/canvas/src/components/tabs/chat/hooks/useChatHistory.ts b/canvas/src/components/tabs/chat/hooks/useChatHistory.ts index 00a16f671..bf7a16acd 100644 --- a/canvas/src/components/tabs/chat/hooks/useChatHistory.ts +++ b/canvas/src/components/tabs/chat/hooks/useChatHistory.ts @@ -2,6 +2,7 @@ import { useCallback, useEffect, useRef, useState } from "react"; import { api } from "@/lib/api"; +import { subscribeSocketResume } from "@/store/socket-events"; import { type ChatMessage, appendMessageDeduped as appendMessageDedupedFn } from "../types"; const INITIAL_HISTORY_LIMIT = 10; @@ -82,6 +83,23 @@ export function useChatHistory( loadInitial(); }, [loadInitial]); + // Back-fill on socket resume. The singleton WS emits this when it + // recovers from a down period (ordinary drop, or — the case this + // fixes — a mobile-browser background-suspend that silently killed + // the socket while the page was frozen). While the socket was dead + // every AGENT_MESSAGE / A2A_RESPONSE for this thread was missed, and + // the store's rehydrate() only re-pulls /workspaces status, not chat. + // Re-running loadInitial() re-fetches the latest persisted history — + // exactly what a navigate-away-and-back (remount) does today, but + // without the user having to do it. Shared by desktop ChatTab and + // MobileChat (both consume this hook), so the realtime path stays + // unified across surfaces rather than forked for mobile. + useEffect(() => { + return subscribeSocketResume(() => { + loadInitial(); + }); + }, [loadInitial]); + const loadOlder = useCallback(async () => { if (inflightRef.current || !hasMoreRef.current) return; const oldest = oldestMessageRef.current; diff --git a/canvas/src/store/__tests__/socket.test.ts b/canvas/src/store/__tests__/socket.test.ts index ee7f94b77..662c1f6af 100644 --- a/canvas/src/store/__tests__/socket.test.ts +++ b/canvas/src/store/__tests__/socket.test.ts @@ -21,12 +21,22 @@ vi.mock("../canvas", () => ({ class MockWebSocket { static instances: MockWebSocket[] = []; + // Mirror the real WebSocket readyState constants — socket.ts's wake + // path reads WebSocket.OPEN / WebSocket.CONNECTING and this.ws.readyState. + static readonly CONNECTING = 0; + static readonly OPEN = 1; + static readonly CLOSING = 2; + static readonly CLOSED = 3; + url: string; onopen: (() => void) | null = null; onmessage: ((event: { data: string }) => void) | null = null; onclose: (() => void) | null = null; onerror: (() => void) | null = null; closeCallCount = 0; + // Starts OPEN once triggerOpen runs; tests flip this to simulate a + // mobile background-suspend that left a dead/half-open socket. + readyState = MockWebSocket.CONNECTING; constructor(url: string) { this.url = url; @@ -35,10 +45,12 @@ class MockWebSocket { close() { this.closeCallCount++; + this.readyState = MockWebSocket.CLOSED; } // Helpers to trigger events in tests triggerOpen() { + this.readyState = MockWebSocket.OPEN; this.onopen?.(); } @@ -59,6 +71,46 @@ class MockWebSocket { } } +// --------------------------------------------------------------------------- +// Minimal DOM stub (vitest environment is 'node' — no window/document). +// socket.ts's wake-recovery attaches visibilitychange/pageshow/online/ +// focus listeners; under node it self-no-ops via a typeof guard, so to +// exercise the path we inject just enough of window/document here, the +// same way WebSocket is stubbed above. Kept tiny on purpose — a single +// listener registry keyed by event name, plus a settable +// visibilityState. +// --------------------------------------------------------------------------- + +interface FakeTarget { + _l: Record void>>; + addEventListener: (type: string, fn: () => void) => void; + removeEventListener: (type: string, fn: () => void) => void; + dispatch: (type: string) => void; +} + +function makeFakeTarget(): FakeTarget { + const l: Record void>> = {}; + return { + _l: l, + addEventListener(type, fn) { + (l[type] ||= []).push(fn); + }, + removeEventListener(type, fn) { + l[type] = (l[type] || []).filter((f) => f !== fn); + }, + dispatch(type) { + for (const fn of l[type] || []) fn(); + }, + }; +} + +const fakeWindow = makeFakeTarget(); +const fakeDocument = Object.assign(makeFakeTarget(), { + visibilityState: "visible" as string, +}); +(globalThis as unknown as Record).window = fakeWindow; +(globalThis as unknown as Record).document = fakeDocument; + // Install mock WebSocket globally before importing socket module (globalThis as unknown as Record).WebSocket = MockWebSocket; @@ -328,6 +380,153 @@ describe("WebSocket onerror", () => { }); }); +// --------------------------------------------------------------------------- +// Wake recovery — mobile background-suspend regression (mobile chat not +// updating in real time until refresh). Simulates: connect → open → +// the OS freezes the page and silently kills the WS WITHOUT firing +// onclose → user returns (visibilitychange / pageshow / online / +// focus) → assert the dead socket is replaced AND, on the new socket's +// open, the resume signal fires so chat history back-fills the missed +// AGENT_MESSAGE / A2A_RESPONSE events. +// --------------------------------------------------------------------------- + +import { + subscribeSocketResume, + _resetSocketResumeListenersForTests, +} from "../socket-events"; + +describe("wake recovery (mobile background-suspend)", () => { + beforeEach(() => { + _resetSocketResumeListenersForTests(); + fakeDocument.visibilityState = "visible"; + }); + + function suspendKill(ws: MockWebSocket) { + // Mobile background-suspend: the OS tore the transport down but the + // page was frozen so onclose never ran. The socket object survives + // with a CLOSED readyState and no reconnect was scheduled. + ws.readyState = MockWebSocket.CLOSED; + } + + it("reconnects on visibilitychange when the socket was silently killed", () => { + connectSocket(); + const ws = getLastWS(); + ws.triggerOpen(); + expect(MockWebSocket.instances).toHaveLength(1); + + suspendKill(ws); + fakeDocument.dispatch("visibilitychange"); + + // A fresh socket must have been created — the stale one is not + // reused. + expect(MockWebSocket.instances.length).toBeGreaterThan(1); + }); + + it("does NOT reconnect on visibilitychange while the socket is still healthy", () => { + connectSocket(); + const ws = getLastWS(); + ws.triggerOpen(); + expect(MockWebSocket.instances).toHaveLength(1); + + // Healthy OPEN socket + a spurious visibilitychange (e.g. quick tab + // peek that never actually suspended) → no churn. + fakeDocument.dispatch("visibilitychange"); + expect(MockWebSocket.instances).toHaveLength(1); + }); + + it("ignores visibilitychange when the page is hidden (the hide transition)", () => { + connectSocket(); + const ws = getLastWS(); + ws.triggerOpen(); + suspendKill(ws); + + fakeDocument.visibilityState = "hidden"; + fakeDocument.dispatch("visibilitychange"); + // Hidden → must not reconnect (would defeat the purpose; we only + // re-arm when the user is actually looking at the page again). + expect(MockWebSocket.instances).toHaveLength(1); + }); + + it.each(["pageshow", "online", "focus"])( + "reconnects on window '%s' after a silent kill", + (evt) => { + connectSocket(); + const ws = getLastWS(); + ws.triggerOpen(); + suspendKill(ws); + + fakeWindow.dispatch(evt); + expect(MockWebSocket.instances.length).toBeGreaterThan(1); + }, + ); + + it("emits the resume signal once the recovered socket re-opens (so chat back-fills missed messages)", () => { + const onResume = vi.fn(); + const unsub = subscribeSocketResume(onResume); + + connectSocket(); + const ws1 = getLastWS(); + ws1.triggerOpen(); + // First open must NOT fire resume — the mount-time chat-history + // fetch already covers the initial load. + expect(onResume).not.toHaveBeenCalled(); + + // Background-suspend silently kills the socket, then the user + // returns. + suspendKill(ws1); + fakeDocument.dispatch("visibilitychange"); + + // The wake handler force-reconnected; the new socket completing its + // handshake is what signals "we recovered from a gap — re-fetch". + const ws2 = getLastWS(); + expect(ws2).not.toBe(ws1); + ws2.triggerOpen(); + + expect(onResume).toHaveBeenCalledTimes(1); + unsub(); + }); + + it("does not emit resume on the very first connect", () => { + const onResume = vi.fn(); + const unsub = subscribeSocketResume(onResume); + connectSocket(); + getLastWS().triggerOpen(); + expect(onResume).not.toHaveBeenCalled(); + unsub(); + }); + + it("emits resume after an ordinary onclose-driven reconnect too (desktop path unchanged)", () => { + const onResume = vi.fn(); + const unsub = subscribeSocketResume(onResume); + + connectSocket(); + const ws1 = getLastWS(); + ws1.triggerOpen(); + // Ordinary network drop — onclose fires normally. + ws1.triggerClose(); + vi.advanceTimersByTime(1100); // past the 1s backoff + const ws2 = getLastWS(); + expect(ws2).not.toBe(ws1); + ws2.triggerOpen(); + + expect(onResume).toHaveBeenCalledTimes(1); + unsub(); + }); + + it("detaches wake listeners on disconnect (no reconnect after teardown)", () => { + connectSocket(); + const ws = getLastWS(); + ws.triggerOpen(); + disconnectSocket(); + + const countAfterDisconnect = MockWebSocket.instances.length; + // A wake event after teardown must be inert. + fakeDocument.dispatch("visibilitychange"); + fakeWindow.dispatch("focus"); + expect(MockWebSocket.instances.length).toBe(countAfterDisconnect); + }); +}); + // --------------------------------------------------------------------------- // Health check (startHealthCheck / stopHealthCheck via onopen / disconnect) // --------------------------------------------------------------------------- diff --git a/canvas/src/store/socket-events.ts b/canvas/src/store/socket-events.ts index b9b8c549e..2d0d2d2e2 100644 --- a/canvas/src/store/socket-events.ts +++ b/canvas/src/store/socket-events.ts @@ -61,3 +61,53 @@ export function subscribeSocketEvents(listener: Listener): () => void { export function _resetSocketEventListenersForTests(): void { listeners.clear(); } + +// --------------------------------------------------------------------------- +// Socket-resume signal +// --------------------------------------------------------------------------- +// +// Fired by the ReconnectingSocket when the WS comes back up AFTER having +// been down (drop, or a mobile-browser background-suspend that silently +// killed the socket while the page was frozen). Distinct from the raw +// event bus above: while the socket was dead the page missed every +// AGENT_MESSAGE / A2A_RESPONSE, and the store's rehydrate() only re-pulls +// /workspaces status — it does NOT back-fill chat messages. Components +// that render a live message thread (desktop ChatTab + MobileChat, both +// via useChatHistory) subscribe here to re-fetch their history on resume +// so missed agent replies appear without the user having to navigate +// away+back or hard-refresh. Shared by desktop and mobile — the recovery +// is in the singleton socket, not forked per-surface. + +type ResumeListener = () => void; + +const resumeListeners = new Set(); + +/** Notify every resume subscriber that the socket just recovered from a + * down period. Called by ReconnectingSocket.onopen, but only when the + * open follows a prior loss (not the very first connect — the initial + * mount-time history fetch already covers that). */ +export function emitSocketResume(): void { + for (const listener of resumeListeners) { + try { + listener(); + } catch (err) { + if (typeof console !== "undefined") { + console.error("socket-resume listener threw:", err); + } + } + } +} + +/** Register a resume subscriber. Returns an unsubscribe function the + * caller must invoke from its effect cleanup. */ +export function subscribeSocketResume(listener: ResumeListener): () => void { + resumeListeners.add(listener); + return () => { + resumeListeners.delete(listener); + }; +} + +/** Test-only: drop all resume subscribers. */ +export function _resetSocketResumeListenersForTests(): void { + resumeListeners.clear(); +} diff --git a/canvas/src/store/socket.ts b/canvas/src/store/socket.ts index 7b2adcd33..574dc0585 100644 --- a/canvas/src/store/socket.ts +++ b/canvas/src/store/socket.ts @@ -1,6 +1,6 @@ import { useCanvasStore } from "./canvas"; import { deriveWsBaseUrl } from "@/lib/ws-url"; -import { emitSocketEvent } from "./socket-events"; +import { emitSocketEvent, emitSocketResume } from "./socket-events"; // If explicit WS_URL is set, use it as-is (may include custom path). // Otherwise derive base + append /ws. @@ -98,9 +98,107 @@ class ReconnectingSocket { // caller can fire-and-forget without coordinating. private rehydrateInFlight: Promise | null = null; private rehydrateDedup = new RehydrateDedup(REHYDRATE_DEDUP_WINDOW_MS); + // True once any onopen has fired. Gates the resume signal so the very + // first connect doesn't fire it (the mount-time chat-history fetch + // already covers the initial load — a resume here would be a wasted + // duplicate). Set on the first successful open and stays true. + private everConnected = false; + // True between a loss (onclose / wake-detected stale socket) and the + // next successful onopen. Only when this is set does onopen emit the + // resume signal — i.e. we recovered from a real gap during which + // AGENT_MESSAGE / A2A_RESPONSE events may have been missed. + private wasDown = false; + // Bound wake handler. iOS Safari / Chrome-mobile freeze the page and + // its timers when the tab is backgrounded or the device locks, and + // tear the WS down WITHOUT reliably firing onclose before the freeze. + // On thaw nothing re-arms: onclose never ran so no reconnect was + // scheduled, and the health-check / fallback-poll intervals were + // suspended. The socket is silently dead until a manual refresh. This + // handler force-reconnects on any wake signal when the socket isn't + // healthy. Stored so disconnect() can detach the listeners. + private onWake: (() => void) | null = null; constructor(url: string) { this.url = url; + this.installWakeListeners(); + } + + /** Attach page-lifecycle listeners that force a reconnect when the + * page returns to the foreground / regains connectivity and the + * socket is not OPEN. Shared by desktop and mobile — desktop rarely + * hits the stale-socket path (its onclose fires promptly) so this is + * effectively a no-op there, while mobile depends on it because the + * background-suspend kills the socket without an onclose. */ + private installWakeListeners() { + if (typeof window === "undefined" || typeof document === "undefined") { + return; + } + const wake = () => { + if (this.disposed) return; + // Only act on a visible page — visibilitychange also fires on the + // hide transition, which we must ignore (closing here would defeat + // the point). + if ( + typeof document.visibilityState === "string" && + document.visibilityState !== "visible" + ) { + return; + } + // Healthy socket → nothing to do. A stale/half-open socket on + // mobile reports CLOSED or CLOSING (the OS tore the transport + // down); CONNECTING is also unhealthy from the user's POV but a + // reconnect attempt is already in flight, so leave it. + const live = + this.ws !== null && + (this.ws.readyState === WebSocket.OPEN || + this.ws.readyState === WebSocket.CONNECTING); + if (live) return; + // Tear down any zombie and reconnect immediately. Mark wasDown so + // the subsequent onopen emits the resume signal and chat threads + // back-fill the messages missed while frozen. + this.wasDown = true; + this.forceReconnect(); + }; + this.onWake = wake; + document.addEventListener("visibilitychange", wake); + window.addEventListener("pageshow", wake); + window.addEventListener("online", wake); + window.addEventListener("focus", wake); + } + + private removeWakeListeners() { + if (!this.onWake) return; + if (typeof window !== "undefined" && typeof document !== "undefined") { + document.removeEventListener("visibilitychange", this.onWake); + window.removeEventListener("pageshow", this.onWake); + window.removeEventListener("online", this.onWake); + window.removeEventListener("focus", this.onWake); + } + this.onWake = null; + } + + /** Detach the current (presumed dead/stale) socket without routing + * through its onclose, cancel any pending backoff timer, and + * reconnect now. Used by the wake path: the browser already killed + * the transport, so the exponential backoff that onclose would have + * scheduled is both absent and undesirable — the user is looking at + * the page and wants it live immediately. */ + private forceReconnect() { + if (this.disposed) return; + if (this.reconnectTimer) { + clearTimeout(this.reconnectTimer); + this.reconnectTimer = null; + } + if (this.ws) { + this.ws.onopen = null; + this.ws.onmessage = null; + this.ws.onclose = null; + this.ws.onerror = null; + try { this.ws.close(); } catch { /* noop */ } + this.ws = null; + } + this.attempt = 0; + this.connect(); } connect() { @@ -132,6 +230,18 @@ class ReconnectingSocket { this.stopFallbackPoll(); this.rehydrate(); this.startHealthCheck(); + // If this open follows a real loss (drop, or a mobile background- + // suspend that the wake handler recovered from), signal resume so + // live message threads re-fetch the AGENT_MESSAGE / A2A_RESPONSE + // history they missed while the socket was dead — rehydrate() + // above only refreshes /workspaces status, not chat. Gate on + // everConnected so the very first open (covered by the mount-time + // history fetch) doesn't fire a redundant resume. + if (this.everConnected && this.wasDown) { + emitSocketResume(); + } + this.everConnected = true; + this.wasDown = false; }; ws.onmessage = (event) => { @@ -157,6 +267,11 @@ class ReconnectingSocket { // corresponds to the WS we just tore down (prevents a stale // onclose from a zombie socket from re-arming the loop). if (this.disposed || this.ws !== ws) return; + // We had a live socket and lost it — mark down so the next onopen + // emits the resume signal and chat threads back-fill missed + // messages. (The wake path also sets this; setting it here covers + // the ordinary network-drop case.) + this.wasDown = true; this.stopHealthCheck(); useCanvasStore.getState().setWsStatus("connecting"); this.startFallbackPoll(); @@ -247,6 +362,7 @@ class ReconnectingSocket { disconnect() { this.disposed = true; + this.removeWakeListeners(); this.stopHealthCheck(); this.stopFallbackPoll(); if (this.reconnectTimer) { -- 2.52.0