molecule-core/canvas/src/store/socket.ts
Hongming Wang 4028b81e04 refactor(canvas): route panel WS subscriptions through global socket
Both AgentCommsPanel and ChatTab's activity-feed opened raw
`new WebSocket(WS_URL)` instances per mount, with no onclose handler
and no reconnect logic. When the underlying connection dropped — idle
timeout, browser background-tab throttle, network jitter — the per-
panel sockets stayed dead until the panel re-mounted (refresh or
sub-tab unmount/remount). Live agent-comms bubbles and live activity
feed lines silently went missing in the gap, manifesting as "the
delegation didn't show up until I refreshed."

The global ReconnectingSocket in store/socket.ts already owns
reconnect, exponential backoff, health-check, and HTTP fallback poll.
Routing component subscribers through it gives every consumer those
guarantees for free, with one TCP connection per tab instead of N.

Three new pieces:

  - store/socket-events.ts: tiny pub/sub bus. emitSocketEvent fans out
    every decoded WSMessage to the listener Set; subscribeSocketEvents
    returns an unsubscribe. A throwing listener is logged and isolated
    so it can't break siblings.

  - store/socket.ts: ws.onmessage now calls emitSocketEvent(msg) right
    after applyEvent(msg), so the store's derived state and component
    subscribers stay in lockstep on every event arrival.

  - hooks/useSocketEvent.ts: React hook that registers exactly once
    per mount, capturing the latest handler in a ref so the closure
    sees current state/props without re-subscribing on every render.

Refactored sites:

  - AgentCommsPanel: replaced its WebSocket-in-useEffect block with
    useSocketEvent. Same parsing logic; the panel no longer opens its
    own connection.

  - ChatTab activity feed: split the previous useEffect in two — one
    seeds the activity log when `sending` flips, the other subscribes
    unconditionally and gates work on `sending` inside the handler.
    Hooks can't be conditional, so the gate has to live in the body
    rather than around the effect.

The ws-close graceful-close helper is no longer needed in either
site; the global socket owns its own teardown.

Tests: 6 new tests for the bus contract (single delivery, fan-out
order, unsubscribe, throwing-listener isolation, no-subscriber emit,
duplicate-subscribe Set semantics). All 27 existing socket tests
still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 13:12:47 -07:00

319 lines
12 KiB
TypeScript

import { useCanvasStore } from "./canvas";
import { deriveWsBaseUrl } from "@/lib/ws-url";
import { emitSocketEvent } from "./socket-events";
// Endpoint the canvas socket connects to. A non-empty NEXT_PUBLIC_WS_URL
// wins and is used verbatim (it may already carry a custom path); when it
// is unset or empty, fall back to the derived base URL with "/ws" appended.
export const WS_URL = process.env.NEXT_PUBLIC_WS_URL || `${deriveWsBaseUrl()}/ws`;
/** Decoded shape of one inbound WebSocket frame. ReconnectingSocket's
 * onmessage handler JSON-parses the raw frame into this type, passes it
 * to the canvas store's applyEvent, and then fans it out through
 * emitSocketEvent. Individual fields are not validated at runtime, so
 * the types below describe the expected contract, not a guarantee. */
export interface WSMessage {
// Event name. NOTE(review): presumably the discriminator that applyEvent
// and socket-event subscribers branch on — confirm against those consumers.
event: string;
// NOTE(review): presumably the id of the workspace the event concerns — confirm.
workspace_id: string;
// NOTE(review): string-encoded time of the event; exact format is not
// visible from this file — confirm against the server.
timestamp: string;
// Event-specific data; left as unknown values so each consumer must
// narrow what it reads.
payload: Record<string, unknown>;
}
/**
 * How long (ms) after a rehydrate completes that further rehydrate()
 * calls are dropped instead of issuing another GET. Handed to the
 * RehydrateDedup gate owned by each ReconnectingSocket.
 *
 * The value is meant to be wide enough to fold together triggers that
 * land back-to-back around a (re)connect, yet narrow enough that a
 * rehydrate prompted later by genuine WS silence is never held back.
 *
 * NOTE(review): the previous wording said the first health-check tick
 * fires "immediately after" onopen, but startHealthCheck schedules its
 * ticks on a 30s interval. The overlap this window absorbs in practice
 * looks like onopen vs. a fallback-poll tick — confirm.
 */
const REHYDRATE_DEDUP_WINDOW_MS = 1.5 * 1_000;
/**
 * Stateful gate that answers "should this rehydrate() call be dropped?".
 *
 * A call is dropped in either of two situations:
 *   - a fetch is in flight (beginFetch was called and completeFetch has
 *     not been called yet);
 *   - the last fetch completed less than `windowMs` before `now`.
 *
 * Lives outside ReconnectingSocket so it can be unit-tested with plain
 * numbers — no dynamic-import mocking, no fake timers. Instances hold
 * state and are meant to be owned by a single socket.
 */
export class RehydrateDedup {
  // True between beginFetch() and completeFetch().
  private inFlight = false;

  // Timestamp passed to the most recent completeFetch(). Starts at
  // -Infinity so the first shouldSkip() always lets the call through;
  // a start value of 0 would wrongly skip whenever `now` is itself
  // near 0 (fake-timer clocks in tests, badly skewed system clocks).
  private completedAt = -Infinity;

  constructor(private readonly windowMs: number) {}

  /** Returns true when a rehydrate requested at `now` should be dropped. */
  shouldSkip(now: number): boolean {
    const sinceCompletion = now - this.completedAt;
    return this.inFlight || sinceCompletion < this.windowMs;
  }

  /** Marks a fetch as started; shouldSkip() returns true until completeFetch(). */
  beginFetch(): void {
    this.inFlight = true;
  }

  /**
   * Marks the current fetch as finished and opens the dedup window.
   * @param now completion timestamp in ms; defaults to the current time.
   */
  completeFetch(now: number = Date.now()): void {
    this.completedAt = now;
    this.inFlight = false;
  }
}
/**
 * Interval (ms) between HTTP fallback rehydrates. ReconnectingSocket
 * runs this poll from the moment connect() is called until onopen
 * fires, and again after every onclose, so the store keeps tracking
 * /workspaces while no realtime events can arrive.
 *
 * Ten seconds is a trade-off: short enough that a workspace moving
 * STARTING → ONLINE on the platform shows up within one tick, long
 * enough not to hammer /workspaces when the network is really down.
 * rehydrate()'s dedup gate folds a poll tick into the post-onopen
 * rehydrate, so a reconnect does not pay for two fetches.
 */
export const FALLBACK_POLL_MS = 10 * 1_000;
/**
 * Auto-reconnecting WebSocket client behind connectSocket() /
 * disconnectSocket().
 *
 * What this class does:
 *   - opens the socket and mirrors its lifecycle into the canvas store
 *     through setWsStatus ("connecting" / "connected" / "disconnected");
 *   - applies every decoded message to the store (applyEvent) and then
 *     fans it out to component subscribers (emitSocketEvent);
 *   - reconnects after a drop with exponential backoff (1s, doubling,
 *     capped at 30s);
 *   - re-fetches /workspaces after 30s of silence on an open socket
 *     (health check) and every FALLBACK_POLL_MS while the socket is not
 *     open (fallback poll).
 *
 * An instance is single-use: once disconnect() has run, connect() is a
 * no-op, so a caller that wants to reconnect must build a new instance.
 */
class ReconnectingSocket {
  // The socket this instance currently owns, or null when there is none
  // (never connected, dropped and waiting for the backoff timer, or
  // disconnected). Handlers compare against it to detect that they
  // belong to an abandoned socket.
  private ws: WebSocket | null = null;
  // Number of drops since the last successful onopen; exponent of the
  // reconnect backoff.
  private attempt = 0;
  private url: string;
  // Date.now() of the last onopen, inbound frame, or health-check
  // rehydrate. The health check measures silence from this point.
  private lastEventTime = 0;
  private healthCheckTimer: ReturnType<typeof setInterval> | null = null;
  private reconnectTimer: ReturnType<typeof setTimeout> | null = null;
  // Polls /workspaces while the WS is unhealthy so the canvas reflects
  // truth even when realtime events aren't arriving. Without this the
  // store can stay frozen for minutes — e.g. workspaces transition
  // STARTING → ONLINE on the platform but the canvas keeps showing
  // STARTING until the WS finally reconnects, triggering false
  // "Provisioning Timeout" banners on already-online workspaces.
  private fallbackPollTimer: ReturnType<typeof setInterval> | null = null;
  // Set by disconnect(). Any in-flight reconnect / handshake must abort
  // early rather than attach to a socket the caller no longer owns —
  // otherwise React StrictMode's effect double-invoke (and any future
  // intentional disconnect) leaves a zombie WebSocket alive forever.
  private disposed = false;
  // In-flight singleton + dedup window for rehydrate. Two reasons to
  // collapse rapid calls:
  //   1. Several triggers (onopen, fallback poll, health check) can ask
  //      for a rehydrate before an earlier GET has returned — a wasted
  //      round trip plus rebuild churn that resets mid-flight UI state
  //      (auto-rescue heuristics, grow passes).
  //   2. Future call sites (a manual "Refresh" button, post-import
  //      hydrate, error-recovery rehydrate) might pile up.
  // Keeping rehydrate idempotent at the call-site level means each
  // caller can fire-and-forget without coordinating.
  private rehydrateInFlight: Promise<void> | null = null;
  private rehydrateDedup = new RehydrateDedup(REHYDRATE_DEDUP_WINDOW_MS);

  /** @param url WebSocket endpoint passed verbatim to `new WebSocket`. */
  constructor(url: string) {
    this.url = url;
  }

  /**
   * Opens a new WebSocket and wires its handlers. No-op once
   * disconnect() has been called.
   *
   * Safe to call while a previous attempt is still alive: any pending
   * backoff timer is cancelled and the previously owned socket is
   * detached and closed first, so at most one socket and one reconnect
   * loop exist per instance.
   */
  connect() {
    if (this.disposed) return;
    // A manual connect() can arrive while the backoff timer is pending
    // or while an earlier socket is still connecting/open. Without this
    // the stale timer would later open a second socket, and the earlier
    // socket would stay connected with nobody listening to it.
    this.clearReconnectTimer();
    if (this.ws) {
      this.stopHealthCheck();
      this.releaseSocket(this.ws);
      this.ws = null;
    }
    useCanvasStore.getState().setWsStatus("connecting");
    // Start the HTTP fallback poll up-front, not just on onclose. Two
    // scenarios this guards against:
    //   1. The very first connect attempt — onclose hasn't fired yet
    //      because we never had a successful onopen.
    //   2. A failed handshake where the browser takes tens of seconds
    //      to surface as onclose (Chrome can hold a SYN-SENT WebSocket
    //      open for ~75s before giving up).
    // Idempotent — startFallbackPoll early-returns if a timer is
    // already running, so calling it from both places is cheap.
    this.startFallbackPoll();
    const ws = new WebSocket(this.url);
    this.ws = ws;

    ws.onopen = () => {
      if (this.disposed || this.ws !== ws) {
        // Late-open on an abandoned socket. Close it cleanly; the
        // caller already moved on.
        try { ws.close(); } catch { /* noop */ }
        return;
      }
      this.attempt = 0;
      this.lastEventTime = Date.now();
      useCanvasStore.getState().setWsStatus("connected");
      this.stopFallbackPoll();
      void this.rehydrate();
      this.startHealthCheck();
    };

    ws.onmessage = (event) => {
      if (this.disposed || this.ws !== ws) return;
      // Any inbound frame — even one that fails to parse — proves the
      // connection is alive, so it resets the silence clock.
      this.lastEventTime = Date.now();
      let parsed: unknown;
      try {
        parsed = JSON.parse(event.data);
      } catch {
        // Malformed WS message — skip silently
        return;
      }
      // Valid JSON that is not an object (null, number, string, …)
      // cannot be a WSMessage; drop it like a malformed frame.
      if (parsed === null || typeof parsed !== "object") return;
      const msg = parsed as WSMessage;
      try {
        useCanvasStore.getState().applyEvent(msg);
        // Fan out to component-level subscribers so panels (Agent
        // Comms, MyChat activity feed) don't have to open their own
        // raw WebSocket — that pattern silently dropped events on
        // any reconnect because the per-component sockets had no
        // onclose / no backoff. See store/socket-events.ts.
        emitSocketEvent(msg);
      } catch (err: unknown) {
        // A failure here is a bug in the store or the event bus, not a
        // bad frame. Keep the socket alive, but make the failure
        // visible instead of discarding it with the parse errors.
        console.warn("[socket] failed to dispatch WS event", err);
      }
    };

    ws.onclose = () => {
      // Fired on intentional close (disposed) OR server/network drop.
      // Only schedule a reconnect when this instance is still live AND
      // the closing socket is the one we currently own (prevents a
      // stale onclose from a zombie socket from re-arming the loop).
      if (this.disposed || this.ws !== ws) return;
      // The socket is dead; drop the reference so the next connect()
      // does not try to retire it a second time.
      this.ws = null;
      this.stopHealthCheck();
      useCanvasStore.getState().setWsStatus("connecting");
      this.startFallbackPoll();
      // Exponential backoff: 1s, 2s, 4s, … capped at 30s.
      const delay = Math.min(1000 * 2 ** this.attempt, 30000);
      this.attempt++;
      this.reconnectTimer = setTimeout(() => {
        this.reconnectTimer = null;
        this.connect();
      }, delay);
    };

    ws.onerror = () => {
      // Suppressed — onclose handles reconnection. onerror fires before onclose
      // and the Event object doesn't contain useful info (serializes to {}).
    };
  }

  /** Periodically re-fetch state in case WebSocket events were missed (e.g. agent
   * status changed while the socket stayed open but no event was emitted). */
  private startHealthCheck() {
    this.stopHealthCheck();
    this.healthCheckTimer = setInterval(() => {
      const silenceSec = (Date.now() - this.lastEventTime) / 1000;
      // If no events for 30s, re-hydrate to catch missed status changes
      if (silenceSec > 30) {
        void this.rehydrate();
        this.lastEventTime = Date.now(); // prevent rapid re-fetches
      }
    }, 30_000);
  }

  /** Cancels the health-check interval if one is running. */
  private stopHealthCheck() {
    if (this.healthCheckTimer) {
      clearInterval(this.healthCheckTimer);
      this.healthCheckTimer = null;
    }
  }

  /** While the WS is in connecting/disconnected limbo, poll /workspaces
   * so the store stays fresh. The reconnect attempts continue in
   * parallel; whichever recovers first wins. rehydrate()'s own dedup
   * gate prevents this from racing with the open-time rehydrate. */
  private startFallbackPoll() {
    if (this.fallbackPollTimer) return;
    this.fallbackPollTimer = setInterval(() => {
      if (this.disposed) {
        this.stopFallbackPoll();
        return;
      }
      void this.rehydrate();
    }, FALLBACK_POLL_MS);
  }

  /** Cancels the fallback-poll interval if one is running. */
  private stopFallbackPoll() {
    if (this.fallbackPollTimer) {
      clearInterval(this.fallbackPollTimer);
      this.fallbackPollTimer = null;
    }
  }

  /** Cancels the pending backoff reconnect, if any. */
  private clearReconnectTimer() {
    if (this.reconnectTimer) {
      clearTimeout(this.reconnectTimer);
      this.reconnectTimer = null;
    }
  }

  /** Detaches every handler from `ws` and closes it, ignoring close() errors.
   * Handlers go first so the resulting close event cannot route back
   * into the reconnect path. */
  private releaseSocket(ws: WebSocket) {
    ws.onopen = null;
    ws.onmessage = null;
    ws.onclose = null;
    ws.onerror = null;
    try { ws.close(); } catch { /* noop */ }
  }

  /**
   * Fetches /workspaces and hydrates the canvas store with the result.
   *
   * Calls are collapsed: while a fetch is in flight its promise is
   * returned to every caller, and for REHYDRATE_DEDUP_WINDOW_MS after a
   * fetch completes further calls resolve immediately without fetching.
   *
   * @returns a promise that settles when the (possibly shared) fetch is
   *   done. It never rejects — fetch failures are swallowed and left to
   *   the next poll / health-check tick to retry.
   */
  private rehydrate(): Promise<void> {
    // Reuse an in-flight fetch — a second caller during the GET
    // shouldn't kick off a parallel one.
    if (this.rehydrateInFlight) return this.rehydrateInFlight;
    if (this.rehydrateDedup.shouldSkip(Date.now())) {
      return Promise.resolve();
    }
    // beginFetch is the first statement of the IIFE and runs
    // synchronously, immediately before the try whose finally always
    // pairs it with completeFetch. Nothing sits between the two, so the
    // gate cannot be left stuck at inFlight=true.
    const promise = (async () => {
      this.rehydrateDedup.beginFetch();
      try {
        const { api } = await import("@/lib/api");
        const workspaces = await api.get<WorkspaceData[]>("/workspaces");
        // The socket may have been disconnected while the GET was in
        // flight; don't write into a store the caller has let go of.
        if (this.disposed) return;
        useCanvasStore.getState().hydrate(workspaces);
      } catch {
        // Rehydration failed — will retry on next health check cycle.
      } finally {
        this.rehydrateDedup.completeFetch(Date.now());
        this.rehydrateInFlight = null;
      }
    })();
    this.rehydrateInFlight = promise;
    return promise;
  }

  /**
   * Permanently shuts this instance down: stops every timer, detaches
   * and closes the socket, and reports "disconnected" to the store.
   * After this, connect() is a no-op.
   */
  disconnect() {
    this.disposed = true;
    this.stopHealthCheck();
    this.stopFallbackPoll();
    this.clearReconnectTimer();
    if (this.ws) {
      // Detach listeners before close() so we don't route the close
      // event through our onclose → reconnect path. Belt + braces on
      // top of the `disposed` check, because StrictMode cycles through
      // so fast that an attached onclose can fire after disposed=true
      // is set but before the handlers are detached.
      this.releaseSocket(this.ws);
      this.ws = null;
    }
    useCanvasStore.getState().setWsStatus("disconnected");
  }
}
/** One workspace record as fetched from GET /workspaces.
 * ReconnectingSocket.rehydrate() requests `WorkspaceData[]` from that
 * endpoint and passes the array to the canvas store's hydrate().
 * NOTE(review): the snake_case field names presumably mirror the
 * server's JSON one-to-one — confirm against the API; fields without a
 * comment below are not described anywhere in this file. */
export interface WorkspaceData {
id: string;
name: string;
role: string;
tier: number;
status: string;
agent_card: Record<string, unknown> | null;
url: string;
parent_id: string | null;
active_tasks: number;
last_error_rate: number;
last_sample_error: string;
uptime_seconds: number;
current_task: string;
runtime: string;
x: number;
y: number;
collapsed: boolean;
/** USD spend ceiling set by the user; null = unlimited. Added by issue #541. */
budget_limit: number | null;
/** Cumulative USD spend for this workspace. Present when the platform tracks spend. */
budget_used?: number | null;
/** Server-declared provisioning-timeout override in milliseconds (#2054).
* Sourced from the workspace's template manifest at provision time —
* lets a slow runtime declare its cold-boot expectation without a
* canvas release. Falls through to the per-runtime profile in
* `@/lib/runtimeProfiles` when absent (the default behavior for any
* template that hasn't yet declared the field). */
provision_timeout_ms?: number | null;
}
// Module-level singleton: the one ReconnectingSocket this module manages.
// Null until the first connectSocket() call and again after
// disconnectSocket().
let socket: ReconnectingSocket | null = null;

/** Creates the singleton socket for WS_URL on first use, then calls
 * connect() on it. connect() is invoked on every call, including when
 * the singleton already exists. */
export function connectSocket() {
  socket ??= new ReconnectingSocket(WS_URL);
  socket.connect();
}
/** Disconnects the singleton socket, if there is one, and forgets it so
 * the next connectSocket() builds a fresh instance. Does nothing when
 * no socket exists. */
export function disconnectSocket() {
  if (socket === null) return;
  socket.disconnect();
  socket = null;
}