Both AgentCommsPanel and ChatTab's activity-feed opened raw
`new WebSocket(WS_URL)` instances per mount, with no onclose handler
and no reconnect logic. When the underlying connection dropped — idle
timeout, browser background-tab throttle, network jitter — the per-
panel sockets stayed dead until the panel re-mounted (refresh or
sub-tab unmount/remount). Live agent-comms bubbles and live activity
feed lines silently went missing in the gap, manifesting as "the
delegation didn't show up until I refreshed."
The global ReconnectingSocket in store/socket.ts already owns
reconnect, exponential backoff, health-check, and HTTP fallback poll.
Routing component subscribers through it gives every consumer those
guarantees for free, with one TCP connection per tab instead of N.
Three new pieces:
- store/socket-events.ts: tiny pub/sub bus. emitSocketEvent fans out
every decoded WSMessage to the listener Set; subscribeSocketEvents
returns an unsubscribe function. A throwing listener is logged and
isolated so it can't break siblings.
- store/socket.ts: ws.onmessage now calls emitSocketEvent(msg) right
after applyEvent(msg), so the store's derived state and component
subscribers stay in lockstep on every event arrival.
- hooks/useSocketEvent.ts: React hook that registers exactly once
per mount, capturing the latest handler in a ref so the closure
sees current state/props without re-subscribing on every render.
Refactored sites:
- AgentCommsPanel: replaced its WebSocket-in-useEffect block with
useSocketEvent. Same parsing logic; the panel no longer opens its
own connection.
- ChatTab activity feed: split the previous useEffect in two — one
seeds the activity log when `sending` flips, the other subscribes
unconditionally and gates work on `sending` inside the handler.
Hooks can't be conditional, so the gate has to live in the body
rather than around the effect.
The ws-close graceful-close helper is no longer needed in either
site; the global socket owns its own teardown.
Tests: 6 new tests for the bus contract (single delivery, fan-out
order, unsubscribe, throwing-listener isolation, no-subscriber emit,
duplicate-subscribe Set semantics). All 27 existing socket tests
still pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
319 lines
12 KiB
TypeScript
319 lines
12 KiB
TypeScript
import { useCanvasStore } from "./canvas";
|
|
import { deriveWsBaseUrl } from "@/lib/ws-url";
|
|
import { emitSocketEvent } from "./socket-events";
|
|
|
|
// An explicitly configured NEXT_PUBLIC_WS_URL is used verbatim (it may carry
// a custom path). When it is unset or empty, fall back to the derived base
// URL with "/ws" appended.
export const WS_URL = process.env.NEXT_PUBLIC_WS_URL
  ? process.env.NEXT_PUBLIC_WS_URL
  : `${deriveWsBaseUrl()}/ws`;
|
|
|
|
/**
 * Envelope of one decoded WebSocket frame. Every frame that parses is handed
 * to the canvas store's `applyEvent` and then to `emitSocketEvent`.
 */
export interface WSMessage {
  /** Event name used by consumers to decide how to handle the frame. */
  event: string;
  /** Identifier of the workspace the event concerns (presumably; confirm against the server). */
  workspace_id: string;
  /** Event time as a string; the exact format is not visible from this module. */
  timestamp: string;
  /** Event-specific data; consumers must narrow values before use. */
  payload: Record<string, unknown>;
}
|
|
|
|
/**
 * How long (ms) a just-finished rehydrate keeps satisfying new rehydrate
 * requests instead of triggering another GET.
 *
 * Sized to cover the connect → health-check sequence: rehydrate runs once
 * from onopen and the first health-check tick may follow right behind it, so
 * both should share one fetch. It is short enough not to hold back the
 * legitimately spaced rehydrates that genuine WS silence triggers later.
 */
const REHYDRATE_DEDUP_WINDOW_MS = 1500;
|
|
|
|
/**
 * Pure dedup gate for rehydrate(). A request is skipped in two states:
 *
 *  - in-flight (between beginFetch and completeFetch);
 *  - post-completion window (0 <= now - completedAt < windowMs).
 *
 * Extracted from ReconnectingSocket so the gate is unit-testable without
 * mocking dynamic imports or fake timers. The class is stateful but tiny;
 * instances are not shared across sockets.
 */
export class RehydrateDedup {
  private inFlight = false;

  // -Infinity so the very first shouldSkip(now) call always passes.
  // Initializing to 0 would false-trip when `now` is also 0 (the default
  // vi.useFakeTimers clock) and on clock-skewed systems close to the epoch.
  private completedAt = Number.NEGATIVE_INFINITY;

  /** @param windowMs Length of the post-completion dedup window, in ms. */
  constructor(private readonly windowMs: number) {}

  /**
   * Reports whether a rehydrate requested at `now` should be suppressed.
   *
   * @param now Current time in ms, on the same clock passed to completeFetch.
   * @returns true while a fetch is in flight or `now` falls inside the
   *   post-completion window; false otherwise.
   */
  shouldSkip(now: number): boolean {
    if (this.inFlight) return true;

    const elapsed = now - this.completedAt;
    // Callers feed wall-clock time (Date.now()), which can step backwards
    // (NTP correction, manual change, VM resume). A negative elapsed time
    // means the recorded completion is in the "future"; treating that as
    // "inside the window" would suppress every rehydrate until the clock
    // caught up, so it counts as outside the window instead.
    return elapsed >= 0 && elapsed < this.windowMs;
  }

  /** Marks a fetch as started; shouldSkip returns true until completeFetch. */
  beginFetch(): void {
    this.inFlight = true;
  }

  /**
   * Marks the fetch as finished and opens the dedup window.
   *
   * @param now Completion time in ms; defaults to Date.now().
   */
  completeFetch(now: number = Date.now()): void {
    this.inFlight = false;
    this.completedAt = now;
  }
}
|
|
|
|
/**
 * Interval (ms) of the HTTP fallback rehydrate that runs while the WS sits in
 * connecting/disconnected limbo.
 *
 * 10s is short enough that the user sees STARTING → ONLINE within one tick
 * after the platform finishes provisioning, yet long enough not to pound
 * /workspaces when the network truly is down. The dedup gate inside
 * rehydrate() collapses this poll against the post-onopen rehydrate, so a
 * reconnect doesn't pay for a duplicate fetch.
 */
export const FALLBACK_POLL_MS = 10 * 1000;
|
|
|
|
/**
 * Owns the single application WebSocket: connects, reconnects with
 * exponential backoff, periodically re-fetches state when the socket is
 * silent, and polls over HTTP while the socket is not connected.
 *
 * Every decoded frame is applied to the canvas store and then fanned out to
 * component-level subscribers through emitSocketEvent.
 */
class ReconnectingSocket {
  /** First reconnect delay; doubled on every consecutive failed attempt. */
  private static readonly RECONNECT_BASE_MS = 1_000;
  /** Upper bound for the reconnect delay. */
  private static readonly RECONNECT_MAX_MS = 30_000;
  /** Health-check cadence, and the silence threshold that triggers a rehydrate. */
  private static readonly HEALTH_CHECK_MS = 30_000;

  private ws: WebSocket | null = null;
  private attempt = 0;
  private readonly url: string;
  private lastEventTime = 0;
  private healthCheckTimer: ReturnType<typeof setInterval> | null = null;
  private reconnectTimer: ReturnType<typeof setTimeout> | null = null;

  // Polls /workspaces while the WS is unhealthy so the canvas reflects
  // truth even when realtime events aren't arriving. Without this the
  // store can stay frozen for minutes — e.g. workspaces transition
  // STARTING → ONLINE on the platform but the canvas keeps showing
  // STARTING until the WS finally reconnects, triggering false
  // "Provisioning Timeout" banners on already-online workspaces.
  private fallbackPollTimer: ReturnType<typeof setInterval> | null = null;

  // disposed signals that disconnect() has been called. Any in-flight
  // reconnect / handshake must abort early rather than attach to a
  // socket the caller no longer owns — otherwise React StrictMode's
  // effect double-invoke (and any future intentional disconnect)
  // leaves a zombie WebSocket alive forever.
  private disposed = false;

  // In-flight singleton + dedup window for rehydrate. Two reasons to
  // collapse rapid calls:
  //   1. connect.onopen fires rehydrate immediately, and the very next
  //      health-check tick may fire it again before the first GET
  //      returns — wasted round trip + rebuild churn that resets the
  //      mid-flight UI state (auto-rescue heuristics, grow passes).
  //   2. Future call sites (a manual "Refresh" button, post-import
  //      hydrate, error-recovery rehydrate) might pile up.
  // Keeping rehydrate idempotent at the call-site level means each
  // caller can fire-and-forget without coordinating.
  private rehydrateInFlight: Promise<void> | null = null;
  private rehydrateDedup = new RehydrateDedup(REHYDRATE_DEDUP_WINDOW_MS);

  /** @param url WebSocket endpoint to connect to. */
  constructor(url: string) {
    this.url = url;
  }

  /**
   * Opens a fresh WebSocket and wires its handlers. No-op after disconnect().
   *
   * Safe to call while a previous socket or a scheduled reconnect still
   * exists: both are torn down first, so at most one socket and one pending
   * reconnect are ever alive per instance.
   */
  connect() {
    if (this.disposed) return;

    // connect() can be re-entered (the module-level connectSocket() calls it
    // on every invocation). Without this teardown the previous socket would
    // be orphaned while still open — its handlers bail on `this.ws !== ws`
    // and nothing ever closes it — and a pending backoff timer would later
    // open yet another socket.
    this.clearReconnectTimer();
    if (this.ws) {
      this.stopHealthCheck();
      this.releaseSocket(this.ws);
      this.ws = null;
    }

    useCanvasStore.getState().setWsStatus("connecting");

    // Start the HTTP fallback poll up-front, not just on onclose. Two
    // scenarios this guards against:
    //   1. The very first connect attempt — onclose hasn't fired yet
    //      because we never had a successful onopen.
    //   2. A failed handshake where the browser takes tens of seconds
    //      to surface as onclose (Chrome can hold a SYN-SENT WebSocket
    //      open for ~75s before giving up).
    // Idempotent — startFallbackPoll early-returns if a timer is
    // already running, so calling it from both places is cheap.
    this.startFallbackPoll();

    const ws = new WebSocket(this.url);
    this.ws = ws;

    ws.onopen = () => {
      if (this.disposed || this.ws !== ws) {
        // Late-open on an abandoned socket. Close it cleanly; the
        // caller already moved on.
        try { ws.close(); } catch { /* noop */ }
        return;
      }
      this.attempt = 0;
      this.lastEventTime = Date.now();
      useCanvasStore.getState().setWsStatus("connected");
      this.stopFallbackPoll();
      void this.rehydrate();
      this.startHealthCheck();
    };

    ws.onmessage = (event) => {
      if (this.disposed || this.ws !== ws) return;
      this.lastEventTime = Date.now();

      let parsed: unknown;
      try {
        parsed = JSON.parse(event.data);
      } catch {
        // Malformed WS message — skip silently.
        return;
      }
      // Valid JSON that isn't an object (null, number, string) can't be an
      // event envelope; drop it the same way as unparseable input.
      if (typeof parsed !== "object" || parsed === null) return;
      // Field-level validation is left to the consumers of the event.
      const msg = parsed as WSMessage;

      try {
        useCanvasStore.getState().applyEvent(msg);
        // Fan out to component-level subscribers so panels (Agent
        // Comms, MyChat activity feed) don't have to open their own
        // raw WebSocket — that pattern silently dropped events on
        // any reconnect because the per-component sockets had no
        // onclose / no backoff. See store/socket-events.ts.
        emitSocketEvent(msg);
      } catch (err: unknown) {
        // A throw here is a store/subscriber bug, not a malformed frame.
        // Keep the socket alive but surface it instead of hiding it.
        console.error("[socket] failed to dispatch WS event", err);
      }
    };

    ws.onclose = () => {
      // Fired on intentional close (disposed) OR server/network drop.
      // Only schedule a reconnect when this instance is still live AND
      // the event belongs to the current socket (prevents a stale
      // onclose from a zombie socket from re-arming the loop).
      if (this.disposed || this.ws !== ws) return;
      this.stopHealthCheck();
      useCanvasStore.getState().setWsStatus("connecting");
      this.startFallbackPoll();
      const delay = Math.min(
        ReconnectingSocket.RECONNECT_BASE_MS * 2 ** this.attempt,
        ReconnectingSocket.RECONNECT_MAX_MS,
      );
      this.attempt++;
      this.clearReconnectTimer();
      this.reconnectTimer = setTimeout(() => {
        this.reconnectTimer = null;
        this.connect();
      }, delay);
    };

    ws.onerror = () => {
      // Suppressed — onclose handles reconnection. onerror fires before onclose
      // and the Event object doesn't contain useful info (serializes to {}).
    };
  }

  /** Detaches every handler from `ws` and closes it, ignoring close errors. */
  private releaseSocket(ws: WebSocket) {
    // Handlers are removed before close() so the close event can never be
    // routed back into onclose and re-arm the reconnect loop.
    ws.onopen = null;
    ws.onmessage = null;
    ws.onclose = null;
    ws.onerror = null;
    try { ws.close(); } catch { /* noop */ }
  }

  /** Cancels a pending backoff reconnect, if any. */
  private clearReconnectTimer() {
    if (this.reconnectTimer) {
      clearTimeout(this.reconnectTimer);
      this.reconnectTimer = null;
    }
  }

  /** Periodically re-fetch state in case WebSocket events were missed (e.g. agent
   *  status changed while the socket stayed open but no event was emitted). */
  private startHealthCheck() {
    this.stopHealthCheck();
    this.healthCheckTimer = setInterval(() => {
      const silenceMs = Date.now() - this.lastEventTime;
      // If no events for 30s, re-hydrate to catch missed status changes.
      if (silenceMs > ReconnectingSocket.HEALTH_CHECK_MS) {
        void this.rehydrate();
        this.lastEventTime = Date.now(); // prevent rapid re-fetches
      }
    }, ReconnectingSocket.HEALTH_CHECK_MS);
  }

  /** Stops the health-check interval if it is running. */
  private stopHealthCheck() {
    if (this.healthCheckTimer) {
      clearInterval(this.healthCheckTimer);
      this.healthCheckTimer = null;
    }
  }

  /** While the WS is in connecting/disconnected limbo, poll /workspaces
   *  so the store stays fresh. The reconnect attempts continue in
   *  parallel; whichever recovers first wins. rehydrate()'s own dedup
   *  gate prevents this from racing with the open-time rehydrate. */
  private startFallbackPoll() {
    if (this.fallbackPollTimer) return;
    this.fallbackPollTimer = setInterval(() => {
      if (this.disposed) {
        this.stopFallbackPoll();
        return;
      }
      void this.rehydrate();
    }, FALLBACK_POLL_MS);
  }

  /** Stops the HTTP fallback poll if it is running. */
  private stopFallbackPoll() {
    if (this.fallbackPollTimer) {
      clearInterval(this.fallbackPollTimer);
      this.fallbackPollTimer = null;
    }
  }

  /**
   * Fetches /workspaces and hydrates the canvas store.
   *
   * @returns The in-flight fetch when one exists, an already-resolved promise
   *   when the dedup gate suppresses the call, otherwise the new fetch. The
   *   promise never rejects: fetch failures are swallowed and retried by the
   *   next health-check / fallback-poll tick.
   */
  private rehydrate(): Promise<void> {
    // Reuse an in-flight fetch — a second caller during the GET
    // shouldn't kick off a parallel one.
    if (this.rehydrateInFlight) return this.rehydrateInFlight;
    if (this.rehydrateDedup.shouldSkip(Date.now())) {
      return Promise.resolve();
    }

    // beginFetch lives INSIDE the IIFE so any future code added between
    // the gate check and the IIFE can't throw and leave the gate stuck at
    // inFlight=true forever. It still runs synchronously, before the
    // first await.
    const promise = (async () => {
      this.rehydrateDedup.beginFetch();
      try {
        const { api } = await import("@/lib/api");
        const workspaces = await api.get<WorkspaceData[]>("/workspaces");
        // The caller may have disconnected while the GET was in flight.
        if (this.disposed) return;
        useCanvasStore.getState().hydrate(workspaces);
      } catch {
        // Rehydration failed — will retry on next health check cycle.
      } finally {
        this.rehydrateDedup.completeFetch(Date.now());
        this.rehydrateInFlight = null;
      }
    })();
    this.rehydrateInFlight = promise;
    return promise;
  }

  /**
   * Permanently shuts this instance down: stops all timers, closes the
   * socket, and marks the store as disconnected. connect() is a no-op
   * afterwards; create a new instance to reconnect.
   */
  disconnect() {
    this.disposed = true;
    this.stopHealthCheck();
    this.stopFallbackPoll();
    this.clearReconnectTimer();
    if (this.ws) {
      // Listeners are detached before close() so the close event is not
      // routed through onclose into the reconnect path. Belt + braces on
      // top of the `disposed` check, because StrictMode cycles through
      // mount/unmount very quickly.
      this.releaseSocket(this.ws);
      this.ws = null;
    }
    useCanvasStore.getState().setWsStatus("disconnected");
  }
}
|
|
|
|
/**
 * One element of the array returned by GET /workspaces, as passed to the
 * canvas store's `hydrate`.
 */
export interface WorkspaceData {
  // Identity / placement in the hierarchy.
  id: string;
  name: string;
  role: string;
  tier: number;
  status: string;
  agent_card: Record<string, unknown> | null;
  url: string;
  parent_id: string | null;

  // Runtime telemetry.
  active_tasks: number;
  last_error_rate: number;
  last_sample_error: string;
  uptime_seconds: number;
  current_task: string;
  runtime: string;

  // Canvas layout.
  x: number;
  y: number;
  collapsed: boolean;

  /** USD spend ceiling set by the user; null = unlimited. Added by issue #541. */
  budget_limit: number | null;

  /** Cumulative USD spend for this workspace. Present when the platform tracks spend. */
  budget_used?: number | null;

  /**
   * Server-declared provisioning-timeout override in milliseconds (#2054).
   * Sourced from the workspace's template manifest at provision time, which
   * lets a slow runtime declare its cold-boot expectation without a canvas
   * release. When absent, the per-runtime profile in `@/lib/runtimeProfiles`
   * applies (the default for any template that hasn't declared the field).
   */
  provision_timeout_ms?: number | null;
}
|
|
|
|
// Module-level singleton: created lazily by connectSocket() and reset to
// null by disconnectSocket(), so a later connectSocket() builds a new instance.
let socket: ReconnectingSocket | null = null;
|
|
|
|
/**
 * Ensures the module-level socket exists (creating it for WS_URL on first
 * use) and calls connect() on it. connect() runs on every invocation, not
 * only when the instance is first created.
 */
export function connectSocket() {
  const active = socket ?? new ReconnectingSocket(WS_URL);
  socket = active;
  active.connect();
}
|
|
|
|
/**
 * Disconnects the module-level socket, if one exists, and drops the
 * reference so the next connectSocket() starts from a fresh instance.
 * Does nothing when no socket has been created.
 */
export function disconnectSocket() {
  if (socket === null) return;
  socket.disconnect();
  socket = null;
}
|