Fix CommunicationOverlay rate-limit storm: cap fan-out + gate on visibility

User report 2026-05-04: 8+ workspace tenant (Design Director + 6 sub-agents + 3 standalones) saw sustained 429s in canvas console hitting /workspaces/<id>/activity?limit=5. Server-side rate limit is 600 req/min/IP.

Three compounding issues in CommunicationOverlay:
1. Polled regardless of visibility — collapsed panel still hammered the API
2. 10s cadence — 6 req every 10s = 36 req/min from this overlay alone
3. Fan-out cap of 6 workspaces — scaled linearly with workspace count

Fix:
- Gate setInterval on `visible` (effect re-runs when collapsed/expanded)
- Cadence 10s → 30s
- Fan-out cap 6 → 3

Combined: ~36 req/min worst case → 6 req/min worst case (6x reduction), 0 req/min when collapsed.

Tests:
- Fan-out cap: 6 online nodes mounted → exactly 3 fetches (was 6)
- Offline gate: offline workspace never polled
- Cadence: timer at 10s = no new fetch; timer at 30s = next batch fires

Each test would fail if the corresponding dial regressed.

Follow-up (out of scope): structurally right fix is to consume the WORKSPACE_ACTIVITY WS broadcast instead of polling per-workspace. Server already publishes the events; canvas just isn't subscribing yet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 03:06:34 -07:00 · 2026-05-04 03:06:34 -07:00 · 26b5b21238
commit 26b5b21238
parent 238f4d45df
2 changed files with 142 additions and 4 deletions
--- a/canvas/src/components/CommunicationOverlay.tsx
+++ b/canvas/src/components/CommunicationOverlay.tsx
@ -32,11 +32,18 @@ export function CommunicationOverlay() {

  const fetchComms = useCallback(async () => {
    try {
-      // Fetch activity from all online workspaces
+      // Fan-out cap: each polled workspace = 1 round-trip. The platform
+      // rate limits at 600 req/min/IP; combined with heartbeats + other
+      // canvas polling, every workspace polled here costs ~2 req/min
+      // (1 every 30s × 1 per workspace). Capping at 3 keeps this
+      // overlay's footprint at 6 req/min worst case — well under
+      // budget even with 8+ workspaces visible. Caught 2026-05-04 when
+      // a user with 8+ workspaces (Design Director + 6 sub-agents +
+      // 3 standalones) saw sustained 429s in canvas console.
      const onlineNodes = nodesRef.current.filter((n) => n.data.status === "online");
      const allComms: Communication[] = [];

-      for (const node of onlineNodes.slice(0, 6)) {
+      for (const node of onlineNodes.slice(0, 3)) {
        try {
          const activities = await api.get<Array<{
            id: string;
@ -91,10 +98,20 @@ export function CommunicationOverlay() {
  }, []);

  useEffect(() => {
+    // Gate polling on visibility — when the user collapses the overlay
+    // the data isn't being read, so the per-workspace fan-out becomes
+    // pure rate-limit overhead. Pre-fix this overlay polled regardless
+    // of whether the panel was shown, costing ~36 req/min from a
+    // hidden surface.
+    if (!visible) return;
    fetchComms();
-    const interval = setInterval(fetchComms, 10000);
+    // 30s cadence (was 10s). At 3-workspace fan-out that's 6 req/min
+    // worst case from this overlay. Combined with heartbeats (~30/min)
+    // and other canvas polling, leaves ample headroom under the 600/
+    // min/IP server-side rate limit even at 8+ workspace tenants.
+    const interval = setInterval(fetchComms, 30000);
    return () => clearInterval(interval);
-  }, [fetchComms]);
+  }, [fetchComms, visible]);

  if (!visible || comms.length === 0) {
    return (
--- a/canvas/src/components/tests/CommunicationOverlay.test.tsx
+++ b/canvas/src/components/tests/CommunicationOverlay.test.tsx
@ -0,0 +1,121 @@
+// @vitest-environment jsdom
+/**
+ * CommunicationOverlay tests — pin the rate-limit fix shipped 2026-05-04.
+ *
+ * The overlay polls /workspaces/:id/activity?limit=5 for each online
+ * workspace. Pre-fix it (a) polled regardless of visibility and (b)
+ * fanned out to 6 workspaces every 10s. With 8+ workspaces a user
+ * triggered sustained 429s (server-side rate limit is 600 req/min/IP).
+ *
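+ * These tests pin:
+ *  1. Fan-out cap of 3 — even with 6 online nodes, only 3 fetches
+ *  2. Offline gate — offline workspaces are never polled
+ *  3. Cadence — no new poll at 10s, next batch fires at 30s
+ *
+ * NOTE: the visibility gate (when collapsed, no polling) is not yet
+ * exercised by a test in this file.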
+ *
+ * If a future refactor pushes either dial back up, CI fails before
+ * the regression hits a paying tenant.
+ */
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { render, cleanup, act, fireEvent } from "@testing-library/react";
+
+// ── Mocks (hoisted before imports) ────────────────────────────────────────────
+
+vi.mock("@/lib/api", () => ({
+  api: { get: vi.fn() },
+}));
+
+// Six online nodes — enough to verify the cap of 3.
+const mockStoreState = {
+  selectedNodeId: null as string | null,
+  nodes: [
+    { id: "ws-1", data: { status: "online", name: "ws-1" } },
+    { id: "ws-2", data: { status: "online", name: "ws-2" } },
+    { id: "ws-3", data: { status: "online", name: "ws-3" } },
+    { id: "ws-4", data: { status: "online", name: "ws-4" } },
+    { id: "ws-5", data: { status: "online", name: "ws-5" } },
+    { id: "ws-6", data: { status: "online", name: "ws-6" } },
+    { id: "ws-offline", data: { status: "offline", name: "off" } },
+  ],
+};
+
+vi.mock("@/store/canvas", () => ({
+  useCanvasStore: vi.fn(
+    (selector: (s: typeof mockStoreState) => unknown) =>
+      selector(mockStoreState)
+  ),
+}));
+
+// design-tokens has named exports — keep the shape minimal.
+vi.mock("@/lib/design-tokens", () => ({
+  COMM_TYPE_LABELS: {
+    a2a_send: "→",
+    a2a_receive: "←",
+    task_update: "✓",
+  },
+}));
+
+// ── Imports (after mocks) ─────────────────────────────────────────────────────
+
+import { api } from "@/lib/api";
+import { CommunicationOverlay } from "../CommunicationOverlay";
+
+const mockGet = vi.mocked(api.get);
+
+// ── Setup ─────────────────────────────────────────────────────────────────────
+
+beforeEach(() => {
+  vi.useFakeTimers();
+  mockGet.mockReset();
+  mockGet.mockResolvedValue([]);
+});
+
+afterEach(() => {
+  cleanup();
+  vi.useRealTimers();
+});
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("CommunicationOverlay — fan-out cap", () => {
+  it("polls at most 3 of 6 online workspaces (rate-limit floor)", async () => {
+    await act(async () => {
+      render(<CommunicationOverlay />);
+    });
+    // Mount kicks off the first poll immediately (no interval tick yet);
+    // the awaited act() lets its sequential per-workspace requests flush.
+    // Pre-fix: 6 calls. Post-fix: 3.
+    expect(mockGet).toHaveBeenCalledTimes(3);
+    // Verify the calls are for the FIRST 3 online nodes (slice order).
+    expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-1/activity?limit=5");
+    expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-2/activity?limit=5");
+    expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-3/activity?limit=5");
+  });
+
+  it("never polls offline workspaces", async () => {
+    await act(async () => {
+      render(<CommunicationOverlay />);
+    });
+    expect(mockGet).not.toHaveBeenCalledWith(
+      "/workspaces/ws-offline/activity?limit=5",
+    );
+  });
+});
+
+describe("CommunicationOverlay — visibility gate", () => {
+  it("uses 30s interval cadence (was 10s pre-fix)", async () => {
+    await act(async () => {
+      render(<CommunicationOverlay />);
+    });
+    expect(mockGet).toHaveBeenCalledTimes(3); // initial mount poll
+
+    // Advance 10s — pre-fix this would fire another poll. Post-fix: silent.
+    await act(async () => {
+      vi.advanceTimersByTime(10_000);
+    });
+    expect(mockGet).toHaveBeenCalledTimes(3);
+
+    // Advance to 30s — interval fires.
+    await act(async () => {
+      vi.advanceTimersByTime(20_000);
+    });
+    expect(mockGet).toHaveBeenCalledTimes(6); // +3 from second tick
+  });
+});