fix(canvas): suppress stale provisioning banners + add WS-down HTTP fallback poll
Two related fixes for the case where the canvas thinks workspaces are stuck provisioning when they're actually online:

1. ProvisioningTimeout banners now gate on wsStatus === "connected". While the WS is in a connecting/disconnected state, the local "provisioning" status reflects the last event received before the drop — workspaces may have transitioned to online minutes ago. The 8m timeout was firing against that frozen state and showing a wall of yellow warnings on already-online workspaces.

2. The socket layer now starts a 10s rehydrate poll when the WS goes unhealthy (onclose) and stops it on onopen/disconnect. Reconnect attempts continue in parallel; whichever recovers first wins. rehydrate()'s existing dedup gate prevents the open-time rehydrate from racing with a fallback poll. Without this the store could stay frozen for minutes while the WS exponential backoff chewed through retries.

Also includes the previously-uncommitted TemplatePalette flushSync change, so the import modal unmounts synchronously before doImport runs (otherwise React batches the close with the import's setState prefix and the modal backdrop hides the spawn animation).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
commit 0b4dfbd121
parent 1d71b4e9e5
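The diff below leans on rehydrate()'s dedup gate several times, but the body of rehydrate() falls outside the hunks, so here is a minimal sketch of how such an in-flight gate typically works. It assumes the store is refreshed from a GET /workspaces response; the field name pendingRehydrate and the setWorkspaces call are illustrative, not the actual implementation.

class RehydrateGateSketch {
  // Holds the promise for the GET that is currently in flight, if any.
  private pendingRehydrate: Promise<void> | null = null;

  rehydrate(): Promise<void> {
    // Dedup gate: a fallback-poll tick and the post-onopen rehydrate that
    // land while a fetch is outstanding share the same request instead of
    // issuing a parallel one.
    if (this.pendingRehydrate) return this.pendingRehydrate;

    this.pendingRehydrate = fetch("/workspaces")
      .then(async (res) => {
        const workspaces = (await res.json()) as unknown[];
        // Illustrative: push the fresh list into the canvas store here,
        // e.g. useCanvasStore.getState().setWorkspaces(workspaces).
        void workspaces;
      })
      .finally(() => {
        // Clear the gate so the next poll tick issues a fresh request.
        this.pendingRehydrate = null;
      });

    return this.pendingRehydrate;
  }
}

With this shape, the 10s fallback poll and the reconnect path can both call rehydrate() freely; at most one /workspaces request is outstanding at a time.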
@@ -65,6 +65,12 @@ export function ProvisioningTimeout({
// banner even if they stay in provisioning. Cleared when the
// workspace leaves provisioning (status changes).
const [dismissed, setDismissed] = useState<Set<string>>(new Set());
// Watch the live WS health. While it's not "connected", local node
// status reflects the last event we received before the drop —
// workspaces may have actually transitioned to online minutes ago.
// Suppress the banner until WS recovers + rehydrate confirms each
// workspace is genuinely still provisioning.
const wsStatus = useCanvasStore((s) => s.wsStatus);

// Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
// (filter+map creates new array reference on every store update).

@@ -251,8 +257,11 @@ export function ProvisioningTimeout({
}, []);

const visibleTimedOut = useMemo(
- () => timedOut.filter((e) => !dismissed.has(e.workspaceId)),
- [timedOut, dismissed],
+ () =>
+   wsStatus === "connected"
+     ? timedOut.filter((e) => !dismissed.has(e.workspaceId))
+     : [],
+ [timedOut, dismissed, wsStatus],
);

if (visibleTimedOut.length === 0) return null;

@@ -1,6 +1,7 @@
"use client";

import { useState, useEffect, useCallback, useRef } from "react";
import { flushSync } from "react-dom";
import { api } from "@/lib/api";
import { useCanvasStore } from "@/store/canvas";
import type { WorkspaceData } from "@/store/socket";

@@ -326,7 +327,18 @@ export function OrgTemplatesSection() {
onSecretSaved={refreshConfiguredKeys}
onProceed={() => {
const org = preflight.org;
- setPreflight(null);
+ // flushSync guarantees the modal unmounts BEFORE we kick
+ // off the import network call. Without it, React batches
+ // setPreflight(null) with the setImporting(...) from
+ // doImport's synchronous prefix, both commit at the end
+ // of this handler, AND the await import() POST may yield
+ // a microtask before React schedules the paint. Net
+ // effect: the modal backdrop sat over the canvas during
+ // the first wave of WORKSPACE_PROVISIONING WS events,
+ // hiding the spawn animation. Force the close to land
+ // first so the user sees the canvas reveal + agents
+ // popping into place.
+ flushSync(() => setPreflight(null));
void doImport(org);
}}
onCancel={() => setPreflight(null)}

@@ -263,13 +263,59 @@ describe("WebSocket onclose – auto-reconnect", () => {
const ws = getLastWS();
ws.triggerClose();

- // Fast-forward timers to trigger the reconnect
- vi.runAllTimers();
+ // First reconnect attempt is scheduled at 1s (Math.min(1000 * 2^0,
+ // 30000)). Advance just past that — vi.runAllTimers() would
+ // additionally re-fire the fallback poll setInterval forever and
+ // hit the 10000-timer abort.
+ vi.advanceTimersByTime(1100);

expect(MockWebSocket.instances.length).toBeGreaterThan(1);
});
});

describe("HTTP fallback poll while WS unhealthy", () => {
it("starts a setInterval after onclose so /workspaces stays fresh", () => {
const setIntervalSpy = vi.spyOn(globalThis, "setInterval");
connectSocket();
const ws = getLastWS();
ws.triggerClose();
// The fallback poll runs at 10s; the reconnect uses setTimeout, so
// any setInterval registered between connect and close must be the
// fallback poll.
const fallbackCalls = setIntervalSpy.mock.calls.filter(
([, delay]) => delay === 10_000,
);
expect(fallbackCalls.length).toBeGreaterThan(0);
setIntervalSpy.mockRestore();
});

it("clears the fallback poll once the WS reconnects (onopen)", () => {
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
connectSocket();
const ws = getLastWS();
ws.triggerClose(); // starts fallback poll
clearIntervalSpy.mockClear();
// Advance past the first reconnect delay so a fresh ws exists,
// then trigger its open.
vi.advanceTimersByTime(1100);
const ws2 = getLastWS();
ws2.triggerOpen();
expect(clearIntervalSpy).toHaveBeenCalled();
clearIntervalSpy.mockRestore();
});

it("clears the fallback poll on disconnect", () => {
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
connectSocket();
const ws = getLastWS();
ws.triggerClose(); // starts fallback poll
clearIntervalSpy.mockClear();
disconnectSocket();
expect(clearIntervalSpy).toHaveBeenCalled();
clearIntervalSpy.mockRestore();
});
});

// ---------------------------------------------------------------------------
// onerror handler
// ---------------------------------------------------------------------------

@@ -56,6 +56,15 @@ export class RehydrateDedup {
}
}

/** Cadence for the HTTP fallback rehydrate that runs while the WS is
* in connecting/disconnected limbo. 10s is short enough that the user
* sees STARTING → ONLINE within one tick after the platform finishes
* provisioning, but long enough to not pound /workspaces if the
* network truly is down. The dedup gate inside rehydrate() collapses
* this against the post-onopen rehydrate, so reconnect doesn't pay
* for a duplicate fetch. */
const FALLBACK_POLL_MS = 10_000;

class ReconnectingSocket {
private ws: WebSocket | null = null;
private attempt = 0;

@@ -63,6 +72,13 @@ class ReconnectingSocket {
private lastEventTime = 0;
private healthCheckTimer: ReturnType<typeof setInterval> | null = null;
private reconnectTimer: ReturnType<typeof setTimeout> | null = null;
// Polls /workspaces while the WS is unhealthy so the canvas reflects
// truth even when realtime events aren't arriving. Without this the
// store can stay frozen for minutes — e.g. workspaces transition
// STARTING → ONLINE on the platform but the canvas keeps showing
// STARTING until the WS finally reconnects, triggering false
// "Provisioning Timeout" banners on already-online workspaces.
private fallbackPollTimer: ReturnType<typeof setInterval> | null = null;
// disposed signals that disconnect() has been called. Any in-flight
// reconnect / handshake must abort early rather than attach to a
// socket the caller no longer owns — otherwise React StrictMode's

@@ -102,6 +118,7 @@ class ReconnectingSocket {
this.attempt = 0;
this.lastEventTime = Date.now();
useCanvasStore.getState().setWsStatus("connected");
this.stopFallbackPoll();
this.rehydrate();
this.startHealthCheck();
};

@@ -125,6 +142,7 @@ class ReconnectingSocket {
if (this.disposed || this.ws !== ws) return;
this.stopHealthCheck();
useCanvasStore.getState().setWsStatus("connecting");
this.startFallbackPoll();
const delay = Math.min(1000 * 2 ** this.attempt, 30000);
this.attempt++;
this.reconnectTimer = setTimeout(() => this.connect(), delay);

@@ -157,6 +175,28 @@ class ReconnectingSocket {
}
}

/** While the WS is in connecting/disconnected limbo, poll /workspaces
* so the store stays fresh. The reconnect attempts continue in
* parallel; whichever recovers first wins. rehydrate()'s own dedup
* gate prevents this from racing with the open-time rehydrate. */
private startFallbackPoll() {
if (this.fallbackPollTimer) return;
this.fallbackPollTimer = setInterval(() => {
if (this.disposed) {
this.stopFallbackPoll();
return;
}
void this.rehydrate();
}, FALLBACK_POLL_MS);
}

private stopFallbackPoll() {
if (this.fallbackPollTimer) {
clearInterval(this.fallbackPollTimer);
this.fallbackPollTimer = null;
}
}

private rehydrate(): Promise<void> {
// Reuse an in-flight fetch — a second caller during the GET
// shouldn't kick off a parallel one.

@@ -191,6 +231,7 @@ class ReconnectingSocket {
disconnect() {
this.disposed = true;
this.stopHealthCheck();
this.stopFallbackPoll();
if (this.reconnectTimer) {
clearTimeout(this.reconnectTimer);
this.reconnectTimer = null;