diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx
index c4ed460c..5b254d95 100644
--- a/canvas/src/components/ProvisioningTimeout.tsx
+++ b/canvas/src/components/ProvisioningTimeout.tsx
@@ -6,11 +6,39 @@ import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
 import { ConsoleModal } from "./ConsoleModal";
 
-/** Base provisioning timeout in milliseconds (2 minutes). Used as the
- * floor; the effective threshold scales with the number of workspaces
- * concurrently provisioning (see effectiveTimeoutMs below). */
+/** Base provisioning timeout in milliseconds (2 minutes). Floor for fast
+ * runtimes (claude-code, langgraph, crewai) on Docker where cold boot
+ * is 30-90s. Slow runtimes override via RUNTIME_TIMEOUT_OVERRIDES_MS.
+ * The effective threshold also scales with concurrent-provisioning
+ * count (see effectiveTimeoutMs below). */
 export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
 
+/** Per-runtime timeout floors for cold-boot sequences that legitimately
+ * exceed the 2-minute default. A too-low threshold creates false-alarm
+ * banners telling users "your workspace is stuck" while it's actually
+ * mid-install — confusing, and it makes users retry workspaces that
+ * would have come online on their own.
+ *
+ * Hermes at 12 min: installs ripgrep + ffmpeg + node22 + builds
+ * hermes-agent from source + Playwright + Chromium (~300MB). Measured
+ * boots on staging EC2 routinely land at 8-13 min. Aligns with the
+ * SaaS E2E PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warning lands
+ * shortly before the backend itself gives up.
+ *
+ * Add entries here as new runtimes surface false-alarm complaints.
+ * Runtimes absent from the map get DEFAULT_PROVISION_TIMEOUT_MS. */
+export const RUNTIME_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
+  hermes: 720_000, // 12 min — see comment above
+};
+
+/** Resolve the base timeout for a workspace given its runtime. */
+export function timeoutForRuntime(runtime: string | undefined): number {
+  if (runtime && runtime in RUNTIME_TIMEOUT_OVERRIDES_MS) {
+    return RUNTIME_TIMEOUT_OVERRIDES_MS[runtime];
+  }
+  return DEFAULT_PROVISION_TIMEOUT_MS;
+}
+
 /** The server provisions up to `PROVISION_CONCURRENCY` containers at
  * once and paces the rest in a queue (`workspaceCreatePacingMs` =
  * 2s). Mirrors the Go constants — if those change, bump these. */
@@ -43,8 +71,12 @@ interface TimeoutEntry {
  * time per node.
  */
 export function ProvisioningTimeout({
-  timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
+  timeoutMs,
 }: {
+  // If undefined (the default when mounted without a prop), each workspace's
+  // threshold is resolved from its runtime via timeoutForRuntime().
+  // Pass an explicit number to force a single threshold for every workspace
+  // (used by tests that want deterministic behavior regardless of runtime).
   timeoutMs?: number;
 }) {
   const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
@@ -57,19 +89,28 @@
   const [dismissed, setDismissed] = useState<Set<string>>(new Set());
 
   // Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
-  // (filter+map creates new array reference on every store update)
+  // (filter+map creates new array reference on every store update).
+  // Runtime included so the timeout threshold can be resolved per-node
+  // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
+  // runtimes — a single threshold would false-alarm on one or the other).
+  // Separator: `|` between fields, `,` between nodes. Names may contain
+  // anything the user typed; strip `|` and `,` so serialization round-trips.
   const provisioningNodes = useCanvasStore((s) => {
     const result = s.nodes
       .filter((n) => n.data.status === "provisioning")
-      .map((n) => `${n.id}:${n.data.name}`);
+      .map((n) => {
+        const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
+        const runtime = n.data.runtime ?? "";
+        return `${n.id}|${safeName}|${runtime}`;
+      });
     return result.join(",");
   });
   const parsedProvisioningNodes = useMemo(
     () =>
      provisioningNodes
        ? provisioningNodes.split(",").map((entry) => {
-           const [id, name] = entry.split(":");
-           return { id, name };
+           const [id, name, runtime] = entry.split("|");
+           return { id, name, runtime };
          })
        : [],
     [provisioningNodes],
@@ -113,14 +154,20 @@
     const interval = setInterval(() => {
       const now = Date.now();
       const newTimedOut: TimeoutEntry[] = [];
 
-      const effective = effectiveTimeoutMs(
-        timeoutMs,
-        parsedProvisioningNodes.length,
-      );
+      // Per-node timeout: each workspace has its own base (runtime-aware)
+      // scaled by the total concurrent-provisioning count. A hermes
+      // workspace in a batch alongside two langgraph workspaces gets
+      // hermes's 12-min base, not langgraph's 2-min base.
       for (const node of parsedProvisioningNodes) {
         const startedAt = tracking.get(node.id);
-        if (startedAt && now - startedAt >= effective) {
+        if (!startedAt) continue;
+        const base = timeoutMs ?? timeoutForRuntime(node.runtime);
+        const effective = effectiveTimeoutMs(
+          base,
+          parsedProvisioningNodes.length,
+        );
+        if (now - startedAt >= effective) {
           newTimedOut.push({
             workspaceId: node.id,
             workspaceName: node.name,
diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
index f1c5b150..7fba5552 100644
--- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
+++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
@@ -7,7 +7,11 @@ global.fetch = vi.fn(() =>
 
 import { useCanvasStore } from "../../store/canvas";
 import type { WorkspaceData } from "../../store/socket";
-import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
+import {
+  DEFAULT_PROVISION_TIMEOUT_MS,
+  RUNTIME_TIMEOUT_OVERRIDES_MS,
+  timeoutForRuntime,
+} from "../ProvisioningTimeout";
 
 // Helper to build a WorkspaceData object
 function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@@ -184,4 +188,45 @@
       .nodes.filter((n) => n.data.status === "provisioning");
     expect(stillProvisioning).toHaveLength(2);
   });
+
+  // ── Runtime-aware timeout regression tests (2026-04-24 outage) ──────────
+  // Prior to this, a hermes workspace consistently false-alarmed at 2 min
+  // into its 8-13 min cold boot, pushing users to retry something that
+  // would have come online on its own. The runtime-aware override keeps
+  // the 2-min floor for fast docker runtimes while giving hermes its
+  // honest 12-min budget.
+
+  describe("timeoutForRuntime", () => {
+    it("returns the 2-min default for unknown/missing runtimes", () => {
+      expect(timeoutForRuntime(undefined)).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("some-future-runtime")).toBe(
+        DEFAULT_PROVISION_TIMEOUT_MS,
+      );
+    });
+
+    it("returns the docker-fast 2-min default for known-fast runtimes", () => {
+      // These aren't in the override map so they get the default.
+      // If someone ever adds one of them to RUNTIME_TIMEOUT_OVERRIDES_MS,
+      // this test catches the accidental regression.
+      expect(timeoutForRuntime("claude-code")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("langgraph")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("crewai")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+    });
+
+    it("returns 12 min for hermes — covers cold-boot install tail", () => {
+      expect(timeoutForRuntime("hermes")).toBe(720_000);
+      expect(timeoutForRuntime("hermes")).toBe(
+        RUNTIME_TIMEOUT_OVERRIDES_MS.hermes,
+      );
+    });
+
+    it("hermes override is materially longer than the default", () => {
+      // Guard against future refactors that accidentally weaken the
+      // override (e.g. typo lowering hermes to 72_000 = 72s).
+      expect(RUNTIME_TIMEOUT_OVERRIDES_MS.hermes).toBeGreaterThanOrEqual(
+        DEFAULT_PROVISION_TIMEOUT_MS * 5,
+      );
+    });
+  });
 });
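
Note for reviewers, not part of the patch: the interval handler above calls effectiveTimeoutMs(base, parsedProvisioningNodes.length), but the helper's body sits outside these hunks. For readers without the full file, here is a minimal TypeScript sketch consistent with the call sites and with the queue-pacing comment in the first hunk. The scaling formula and the value of PROVISION_CONCURRENCY are assumptions for illustration, not the repository's actual implementation.

```typescript
// Sketch only: assumed shape of the helper referenced by the diff.
// These mirror the Go server constants described in the first hunk;
// the PROVISION_CONCURRENCY value here is a hypothetical placeholder.
const PROVISION_CONCURRENCY = 4; // hypothetical value
const WORKSPACE_CREATE_PACING_MS = 2_000; // "workspaceCreatePacingMs = 2s"

/**
 * Scale a base timeout by how many workspaces are provisioning at once.
 * Anything beyond the server's concurrency window waits in a paced queue,
 * so each queued slot earns extra budget before the UI raises a warning.
 */
export function effectiveTimeoutMs(
  baseMs: number,
  provisioningCount: number,
): number {
  const queued = Math.max(0, provisioningCount - PROVISION_CONCURRENCY);
  return baseMs + queued * WORKSPACE_CREATE_PACING_MS;
}
```

Under this sketch, a hermes workspace in a batch of six would get 720_000 + 2 * 2_000 ms before the banner fires, while a test that passes an explicit timeoutMs skips the runtime lookup but still receives the concurrency scaling.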