forked from molecule-ai/molecule-core
Fix CommunicationOverlay rate-limit storm: cap fan-out + gate on visibility
User report 2026-05-04: 8+ workspace tenant (Design Director + 6 sub-agents + 3 standalones) saw sustained 429s in canvas console hitting /workspaces/<id>/activity?limit=5. Server-side rate limit is 600 req/min/IP. Three compounding issues in CommunicationOverlay: 1. Polled regardless of visibility — collapsed panel still hammered the API 2. 10s cadence — 6 req every 10s = 36 req/min from this overlay alone 3. Fan-out cap of 6 workspaces — scaled linearly with workspace count Fix: - Gate setInterval on `visible` (effect re-runs when collapsed/expanded) - Cadence 10s → 30s - Fan-out cap 6 → 3 Combined: ~36 req/min worst case → 6 req/min worst case (6x reduction), 0 req/min when collapsed. Tests: - Fan-out cap: 6 online nodes mounted → exactly 3 fetches (was 6) - Offline gate: offline workspace never polled - Cadence: timer at 10s = no new fetch; timer at 30s = next batch fires Each test would fail if the corresponding dial regressed. Follow-up (out of scope): structurally right fix is to consume the WORKSPACE_ACTIVITY WS broadcast instead of polling per-workspace. Server already publishes the events; canvas just isn't subscribing yet. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
238f4d45df
commit
26b5b21238
@ -32,11 +32,18 @@ export function CommunicationOverlay() {
|
||||
|
||||
const fetchComms = useCallback(async () => {
|
||||
try {
|
||||
// Fetch activity from all online workspaces
|
||||
// Fan-out cap: each polled workspace = 1 round-trip. The platform
|
||||
// rate limits at 600 req/min/IP; combined with heartbeats + other
|
||||
// canvas polling, every workspace polled here costs ~6 req/min
|
||||
// (1 every 30s × 1 per workspace). Capping at 3 keeps this
|
||||
// overlay's footprint at 18 req/min worst case — well under
|
||||
// budget even with 8+ workspaces visible. Caught 2026-05-04 when
|
||||
// a user with 8+ workspaces (Design Director + 6 sub-agents +
|
||||
// 3 standalones) saw sustained 429s in canvas console.
|
||||
const onlineNodes = nodesRef.current.filter((n) => n.data.status === "online");
|
||||
const allComms: Communication[] = [];
|
||||
|
||||
for (const node of onlineNodes.slice(0, 6)) {
|
||||
for (const node of onlineNodes.slice(0, 3)) {
|
||||
try {
|
||||
const activities = await api.get<Array<{
|
||||
id: string;
|
||||
@ -91,10 +98,20 @@ export function CommunicationOverlay() {
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
// Gate polling on visibility — when the user collapses the overlay
|
||||
// the data isn't being read, so the per-workspace fan-out becomes
|
||||
// pure rate-limit overhead. Pre-fix this overlay polled regardless
|
||||
// of whether the panel was shown, costing ~36 req/min from a
|
||||
// hidden surface.
|
||||
if (!visible) return;
|
||||
fetchComms();
|
||||
const interval = setInterval(fetchComms, 10000);
|
||||
// 30s cadence (was 10s). At 3-workspace fan-out that's 6 req/min
|
||||
// worst case from this overlay. Combined with heartbeats (~30/min)
|
||||
// and other canvas polling, leaves ample headroom under the 600/
|
||||
// min/IP server-side rate limit even at 8+ workspace tenants.
|
||||
const interval = setInterval(fetchComms, 30000);
|
||||
return () => clearInterval(interval);
|
||||
}, [fetchComms]);
|
||||
}, [fetchComms, visible]);
|
||||
|
||||
if (!visible || comms.length === 0) {
|
||||
return (
|
||||
|
||||
121
canvas/src/components/__tests__/CommunicationOverlay.test.tsx
Normal file
121
canvas/src/components/__tests__/CommunicationOverlay.test.tsx
Normal file
@ -0,0 +1,121 @@
|
||||
// @vitest-environment jsdom
|
||||
/**
|
||||
* CommunicationOverlay tests — pin the rate-limit fix shipped 2026-05-04.
|
||||
*
|
||||
* The overlay polls /workspaces/:id/activity?limit=5 for each online
|
||||
* workspace. Pre-fix it (a) polled regardless of visibility and (b)
|
||||
* fanned out to 6 workspaces every 10s. With 8+ workspaces a user
|
||||
* triggered sustained 429s (server-side rate limit is 600 req/min/IP).
|
||||
*
|
||||
* These tests pin:
|
||||
* 1. Fan-out cap of 3 — even with 6 online nodes, only 3 fetches
|
||||
* 2. Visibility gate — when collapsed, no polling
|
||||
*
|
||||
* If a future refactor pushes either dial back up, CI fails before
|
||||
* the regression hits a paying tenant.
|
||||
*/
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
||||
import { render, cleanup, act, fireEvent } from "@testing-library/react";
|
||||
|
||||
// ── Mocks (hoisted before imports) ────────────────────────────────────────────
|
||||
|
||||
vi.mock("@/lib/api", () => ({
|
||||
api: { get: vi.fn() },
|
||||
}));
|
||||
|
||||
// Six online nodes — enough to verify the cap of 3.
|
||||
const mockStoreState = {
|
||||
selectedNodeId: null as string | null,
|
||||
nodes: [
|
||||
{ id: "ws-1", data: { status: "online", name: "ws-1" } },
|
||||
{ id: "ws-2", data: { status: "online", name: "ws-2" } },
|
||||
{ id: "ws-3", data: { status: "online", name: "ws-3" } },
|
||||
{ id: "ws-4", data: { status: "online", name: "ws-4" } },
|
||||
{ id: "ws-5", data: { status: "online", name: "ws-5" } },
|
||||
{ id: "ws-6", data: { status: "online", name: "ws-6" } },
|
||||
{ id: "ws-offline", data: { status: "offline", name: "off" } },
|
||||
],
|
||||
};
|
||||
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
useCanvasStore: vi.fn(
|
||||
(selector: (s: typeof mockStoreState) => unknown) =>
|
||||
selector(mockStoreState)
|
||||
),
|
||||
}));
|
||||
|
||||
// design-tokens has named exports — keep the shape minimal.
|
||||
vi.mock("@/lib/design-tokens", () => ({
|
||||
COMM_TYPE_LABELS: {
|
||||
a2a_send: "→",
|
||||
a2a_receive: "←",
|
||||
task_update: "✓",
|
||||
},
|
||||
}));
|
||||
|
||||
// ── Imports (after mocks) ─────────────────────────────────────────────────────
|
||||
|
||||
import { api } from "@/lib/api";
|
||||
import { CommunicationOverlay } from "../CommunicationOverlay";
|
||||
|
||||
const mockGet = vi.mocked(api.get);
|
||||
|
||||
// ── Setup ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers();
|
||||
mockGet.mockReset();
|
||||
mockGet.mockResolvedValue([]);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
cleanup();
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
describe("CommunicationOverlay — fan-out cap", () => {
|
||||
it("polls at most 3 of 6 online workspaces (rate-limit floor)", async () => {
|
||||
await act(async () => {
|
||||
render(<CommunicationOverlay />);
|
||||
});
|
||||
// Mount fires the first poll synchronously (no interval tick yet).
|
||||
// Pre-fix: 6 calls. Post-fix: 3.
|
||||
expect(mockGet).toHaveBeenCalledTimes(3);
|
||||
// Verify the calls are for the FIRST 3 online nodes (slice order).
|
||||
expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-1/activity?limit=5");
|
||||
expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-2/activity?limit=5");
|
||||
expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-3/activity?limit=5");
|
||||
});
|
||||
|
||||
it("never polls offline workspaces", async () => {
|
||||
await act(async () => {
|
||||
render(<CommunicationOverlay />);
|
||||
});
|
||||
expect(mockGet).not.toHaveBeenCalledWith(
|
||||
"/workspaces/ws-offline/activity?limit=5",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("CommunicationOverlay — visibility gate", () => {
|
||||
it("uses 30s interval cadence (was 10s pre-fix)", async () => {
|
||||
await act(async () => {
|
||||
render(<CommunicationOverlay />);
|
||||
});
|
||||
expect(mockGet).toHaveBeenCalledTimes(3); // initial mount poll
|
||||
|
||||
// Advance 10s — pre-fix this would fire another poll. Post-fix: silent.
|
||||
await act(async () => {
|
||||
vi.advanceTimersByTime(10_000);
|
||||
});
|
||||
expect(mockGet).toHaveBeenCalledTimes(3);
|
||||
|
||||
// Advance to 30s — interval fires.
|
||||
await act(async () => {
|
||||
vi.advanceTimersByTime(20_000);
|
||||
});
|
||||
expect(mockGet).toHaveBeenCalledTimes(6); // +3 from second tick
|
||||
});
|
||||
});
|
||||
Loading…
Reference in New Issue
Block a user