From 9597d262ca2b3c1507a2c69bf29794aaabb6f8d1 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 24 Apr 2026 11:46:09 -0700
Subject: [PATCH 1/2] fix(canvas): runtime-aware provisioning-timeout threshold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hermes workspaces cold-boot in 8-13 min (ripgrep + ffmpeg + node22 +
hermes-agent source build + Playwright + Chromium ~300MB). The canvas's
hardcoded "Provisioning Timeout" warning fired at the 2-min mark and
told users their workspace was "stuck" while it was still mid-install.
Users hit Retry, triggering fresh cold boots and cancelling healthy
workspaces.

User-facing symptom (reported 2026-04-24 18:35Z): a hermes workspace
showed "has been provisioning for 3m 15s — it may have encountered an
issue" with Retry + Cancel buttons, while the EC2 was installing
node_modules.

Fix:
- Keep DEFAULT_PROVISION_TIMEOUT_MS = 120_000 (2 min) — correct for
  fast docker runtimes (claude-code, langgraph, crewai) where cold boot
  is 30-90s.
- Add RUNTIME_TIMEOUT_OVERRIDES_MS = { hermes: 720_000 } (12 min).
  Aligns with tests/e2e/test_staging_full_saas.sh's
  PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warns shortly before
  the backend itself gives up.
- New timeoutForRuntime() resolves the base; per-node lookup in the
  check-timeouts interval so a mixed batch (1 hermes + 2 langgraph)
  uses the right threshold for each.
- timeoutMs prop is now optional. Undefined → per-runtime lookup; a
  number → forces a single threshold for every workspace (tests use
  this for deterministic behavior).

Tests: 4 new cases pinning the runtime-aware resolution, including a
guard that catches future regressions that would weaken hermes's
budget. Existing tests unchanged (they import
DEFAULT_PROVISION_TIMEOUT_MS, which still exports 120_000). 13/13 pass.
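
Resolution at a glance (values from the new map; the concurrency
scaling in effectiveTimeoutMs still applies on top of the base):

    timeoutForRuntime("hermes")    // 720_000 (12 min, override)
    timeoutForRuntime("langgraph") // 120_000 (2 min, default; not in map)
    timeoutForRuntime(undefined)   // 120_000 (2 min, default)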

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 canvas/src/components/ProvisioningTimeout.tsx | 73 +++++++++++++++----
 .../__tests__/ProvisioningTimeout.test.tsx    | 47 +++++++++++-
 2 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx
index c4ed460c..5b254d95 100644
--- a/canvas/src/components/ProvisioningTimeout.tsx
+++ b/canvas/src/components/ProvisioningTimeout.tsx
@@ -6,11 +6,39 @@ import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
 import { ConsoleModal } from "./ConsoleModal";
 
-/** Base provisioning timeout in milliseconds (2 minutes). Used as the
- * floor; the effective threshold scales with the number of workspaces
- * concurrently provisioning (see effectiveTimeoutMs below). */
+/** Base provisioning timeout in milliseconds (2 minutes). Floor for fast
+ * runtimes (claude-code, langgraph, crewai) on Docker where cold boot
+ * is 30-90s. Slow runtimes override via RUNTIME_TIMEOUT_OVERRIDES_MS.
+ * The effective threshold also scales with concurrent-provisioning
+ * count (see effectiveTimeoutMs below). */
 export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
 
+/** Per-runtime timeout floors for cold-boot sequences that legitimately
+ * exceed the 2-minute default. A too-low threshold creates false-alarm
+ * banners telling users "your workspace is stuck" while it's actually
+ * mid-install — confusing, and it makes users retry workspaces that
+ * would have come online on their own.
+ *
+ * Hermes at 12min: installs ripgrep + ffmpeg + node22 + builds
+ * hermes-agent from source + Playwright + Chromium (~300MB). Measured
+ * boots on staging EC2 routinely land at 8-13 min. Aligns with the
+ * SaaS E2E PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warning lands
+ * shortly before the backend itself gives up.
+ *
+ * Add entries here as new runtimes surface false-alarm complaints.
+ * Runtimes absent from the map get DEFAULT_PROVISION_TIMEOUT_MS. */
+export const RUNTIME_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
+  hermes: 720_000, // 12 min — see comment above
+};
+
+/** Resolve the base timeout for a workspace given its runtime. */
+export function timeoutForRuntime(runtime: string | undefined): number {
+  if (runtime && runtime in RUNTIME_TIMEOUT_OVERRIDES_MS) {
+    return RUNTIME_TIMEOUT_OVERRIDES_MS[runtime];
+  }
+  return DEFAULT_PROVISION_TIMEOUT_MS;
+}
+
 /** The server provisions up to `PROVISION_CONCURRENCY` containers at
  * once and paces the rest in a queue (`workspaceCreatePacingMs` =
  * 2s). Mirrors the Go constants — if those change, bump these. */
@@ -43,8 +71,12 @@ interface TimeoutEntry {
  * time per node. */
 export function ProvisioningTimeout({
-  timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
+  timeoutMs,
 }: {
+  // If undefined (the default when mounted without a prop), each workspace's
+  // threshold is resolved from its runtime via timeoutForRuntime().
+  // Pass an explicit number to force a single threshold for every workspace
+  // (used by tests that want deterministic behavior regardless of runtime).
   timeoutMs?: number;
 }) {
   const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
@@ -57,19 +89,28 @@ export function ProvisioningTimeout({
   const [dismissed, setDismissed] = useState<Set<string>>(new Set());
 
   // Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
-  // (filter+map creates new array reference on every store update)
+  // (filter+map creates new array reference on every store update).
+  // Runtime included so the timeout threshold can be resolved per-node
+  // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
+  // runtimes — a single threshold would false-alarm on one or the other).
+  // Separator: `|` between fields, `,` between nodes. Names may contain
+  // anything the user typed; strip `|` and `,` so serialization round-trips.
   const provisioningNodes = useCanvasStore((s) => {
     const result = s.nodes
       .filter((n) => n.data.status === "provisioning")
-      .map((n) => `${n.id}:${n.data.name}`);
+      .map((n) => {
+        const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
+        const runtime = n.data.runtime ?? "";
+        return `${n.id}|${safeName}|${runtime}`;
+      });
     return result.join(",");
   });
 
   const parsedProvisioningNodes = useMemo(
     () =>
       provisioningNodes
         ? provisioningNodes.split(",").map((entry) => {
-            const [id, name] = entry.split(":");
-            return { id, name };
+            const [id, name, runtime] = entry.split("|");
+            return { id, name, runtime };
           })
         : [],
     [provisioningNodes],
@@ -113,14 +154,20 @@
     const interval = setInterval(() => {
       const now = Date.now();
       const newTimedOut: TimeoutEntry[] = [];
-      const effective = effectiveTimeoutMs(
-        timeoutMs,
-        parsedProvisioningNodes.length,
-      );
+      // Per-node timeout: each workspace has its own base (runtime-aware)
+      // scaled by the total concurrent-provisioning count. A hermes
+      // workspace in a batch alongside two langgraph workspaces gets
+      // hermes's 12-min base, not langgraph's 2-min base.
       for (const node of parsedProvisioningNodes) {
         const startedAt = tracking.get(node.id);
-        if (startedAt && now - startedAt >= effective) {
+        if (!startedAt) continue;
+        const base = timeoutMs ?? timeoutForRuntime(node.runtime);
+        const effective = effectiveTimeoutMs(
+          base,
+          parsedProvisioningNodes.length,
+        );
+        if (now - startedAt >= effective) {
           newTimedOut.push({
             workspaceId: node.id,
             workspaceName: node.name,
diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
index f1c5b150..7fba5552 100644
--- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
+++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
@@ -7,7 +7,11 @@ global.fetch = vi.fn(() =>
 import { useCanvasStore } from "../../store/canvas";
 import type { WorkspaceData } from "../../store/socket";
 
-import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
+import {
+  DEFAULT_PROVISION_TIMEOUT_MS,
+  RUNTIME_TIMEOUT_OVERRIDES_MS,
+  timeoutForRuntime,
+} from "../ProvisioningTimeout";
 
 // Helper to build a WorkspaceData object
 function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@@ -184,4 +188,45 @@ describe("ProvisioningTimeout", () => {
       .nodes.filter((n) => n.data.status === "provisioning");
     expect(stillProvisioning).toHaveLength(2);
   });
+
+  // ── Runtime-aware timeout regression tests (2026-04-24 outage) ──────────
+  // Prior to this, a hermes workspace consistently false-alarmed at 2 min
+  // into its 8-13 min cold boot, pushing users to retry something that
+  // would have come online on its own. The runtime-aware override keeps
+  // the 2-min floor for fast docker runtimes while giving hermes its
+  // honest 12-min budget.
+
+  describe("timeoutForRuntime", () => {
+    it("returns the 2-min default for unknown/missing runtimes", () => {
+      expect(timeoutForRuntime(undefined)).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("some-future-runtime")).toBe(
+        DEFAULT_PROVISION_TIMEOUT_MS,
+      );
+    });
+
+    it("returns the docker-fast 2-min default for known-fast runtimes", () => {
+      // These aren't in the override map so they get the default.
+      // If someone ever adds one of them to RUNTIME_TIMEOUT_OVERRIDES_MS,
+      // this test catches the accidental regression.
+      expect(timeoutForRuntime("claude-code")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("langgraph")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("crewai")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+    });
+
+    it("returns 12 min for hermes — covers cold-boot install tail", () => {
+      expect(timeoutForRuntime("hermes")).toBe(720_000);
+      expect(timeoutForRuntime("hermes")).toBe(
+        RUNTIME_TIMEOUT_OVERRIDES_MS.hermes,
+      );
+    });
+
+    it("hermes override is materially longer than the default", () => {
+      // Guard against future refactors that accidentally weaken the
+      // override (e.g. typo lowering hermes to 72_000 = 72s).
+      expect(RUNTIME_TIMEOUT_OVERRIDES_MS.hermes).toBeGreaterThanOrEqual(
+        DEFAULT_PROVISION_TIMEOUT_MS * 5,
+      );
+    });
+  });
 });

From 0b237ed9dde24168900d47897afd76fc6d314643 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 24 Apr 2026 11:48:39 -0700
Subject: [PATCH 2/2] refactor(canvas): extract runtime profiles to @/lib/runtimeProfiles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Preparation for a "hundreds of runtimes" plugin ecosystem. Keeping the
runtime-specific UX knobs inline inside ProvisioningTimeout scales
badly — every new runtime would require editing a component, not just
adding a table entry. Other components (create-workspace dialog,
workspace card tooltips, etc.) will want the same runtime metadata.
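
As an illustration, a consumer outside this component could read the
same metadata with no runtime-name knowledge of its own (sketch; the
ExpectedBootHint component is hypothetical, getRuntimeProfile is the
real resolver added by this patch):

    import { getRuntimeProfile } from "@/lib/runtimeProfiles";

    // Hypothetical create-workspace dialog hint.
    function ExpectedBootHint({ runtime }: { runtime?: string }) {
      const { provisionTimeoutMs } = getRuntimeProfile(runtime);
      const mins = Math.ceil(provisionTimeoutMs / 60_000);
      return <span>first boot can take up to ~{mins} min</span>;
    }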

Changes:
- New file `canvas/src/lib/runtimeProfiles.ts` owns:
  * `RuntimeProfile` type — structural shape, every field optional so
    new runtimes can partially fill without breaking consumers.
  * `DEFAULT_RUNTIME_PROFILE` — 2-min default floor (docker-fast).
  * `RUNTIME_PROFILES` — named overrides (currently: hermes 12 min).
  * `WorkspaceRuntimeOverrides` — interface for server-provided
    per-workspace overrides, so operators can tune via template
    manifest / workspace metadata without a canvas release.
  * `getRuntimeProfile()` — resolver with overrides → profile → default
    priority.
  * `provisionTimeoutForRuntime()` — convenience wrapper.
- `ProvisioningTimeout.tsx` now delegates to the profile module.
  `DEFAULT_PROVISION_TIMEOUT_MS` re-exported for legacy test importers.
- Tests: 16/16 (up from 9 before the first fix). Adds pinning for:
  * overrides > profile > default priority chain
  * "every entry in RUNTIME_PROFILES resolves to a number" contract
  * backward-compat export

Adding a new slow runtime is now one table entry in
`canvas/src/lib/runtimeProfiles.ts` with a mandatory WHY comment.
Moving to server-driven profiles later is a ~10-line change (the
resolver already threads WorkspaceRuntimeOverrides through).
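
Sketch of what one such entry looks like ("slow-runtime-x" is a
made-up name; hermes is the only real entry today):

    export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
      hermes: { provisionTimeoutMs: 720_000 /* WHY: see the file */ },
      "slow-runtime-x": {
        // WHY: compiles its agent toolchain from source on first boot;
        // staging cold boots measured around 5 min.
        provisionTimeoutMs: 360_000, // 6 min
      },
    };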

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 canvas/src/components/ProvisioningTimeout.tsx |  51 +++-----
 .../__tests__/ProvisioningTimeout.test.tsx    | 121 +++++++++++++-----
 canvas/src/lib/runtimeProfiles.ts             | 120 +++++++++++++++++
 3 files changed, 225 insertions(+), 67 deletions(-)
 create mode 100644 canvas/src/lib/runtimeProfiles.ts

diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx
index 5b254d95..1c09fa3b 100644
--- a/canvas/src/components/ProvisioningTimeout.tsx
+++ b/canvas/src/components/ProvisioningTimeout.tsx
@@ -6,38 +6,16 @@ import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
 import { ConsoleModal } from "./ConsoleModal";
 
-/** Base provisioning timeout in milliseconds (2 minutes). Floor for fast
- * runtimes (claude-code, langgraph, crewai) on Docker where cold boot
- * is 30-90s. Slow runtimes override via RUNTIME_TIMEOUT_OVERRIDES_MS.
- * The effective threshold also scales with concurrent-provisioning
- * count (see effectiveTimeoutMs below). */
-export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
+import {
+  DEFAULT_RUNTIME_PROFILE,
+  provisionTimeoutForRuntime,
+} from "@/lib/runtimeProfiles";
 
-/** Per-runtime timeout floors for cold-boot sequences that legitimately
- * exceed the 2-minute default. A too-low threshold creates false-alarm
- * banners telling users "your workspace is stuck" while it's actually
- * mid-install — confusing, and it makes users retry workspaces that
- * would have come online on their own.
- *
- * Hermes at 12min: installs ripgrep + ffmpeg + node22 + builds
- * hermes-agent from source + Playwright + Chromium (~300MB). Measured
- * boots on staging EC2 routinely land at 8-13 min. Aligns with the
- * SaaS E2E PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warning lands
- * shortly before the backend itself gives up.
- *
- * Add entries here as new runtimes surface false-alarm complaints.
- * Runtimes absent from the map get DEFAULT_PROVISION_TIMEOUT_MS. */
-export const RUNTIME_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
-  hermes: 720_000, // 12 min — see comment above
-};
+/** Re-export for backward compatibility with tests and other importers
+ * that previously imported DEFAULT_PROVISION_TIMEOUT_MS from this file.
+ * New code should read via getRuntimeProfile() from @/lib/runtimeProfiles. */
+export const DEFAULT_PROVISION_TIMEOUT_MS =
+  DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs;
 
-/** Resolve the base timeout for a workspace given its runtime. */
-export function timeoutForRuntime(runtime: string | undefined): number {
-  if (runtime && runtime in RUNTIME_TIMEOUT_OVERRIDES_MS) {
-    return RUNTIME_TIMEOUT_OVERRIDES_MS[runtime];
-  }
-  return DEFAULT_PROVISION_TIMEOUT_MS;
-}
 
 /** The server provisions up to `PROVISION_CONCURRENCY` containers at
  * once and paces the rest in a queue (`workspaceCreatePacingMs` =
  * 2s). Mirrors the Go constants — if those change, bump these. */
@@ -155,14 +133,15 @@
       const now = Date.now();
       const newTimedOut: TimeoutEntry[] = [];
 
-      // Per-node timeout: each workspace has its own base (runtime-aware)
-      // scaled by the total concurrent-provisioning count. A hermes
-      // workspace in a batch alongside two langgraph workspaces gets
-      // hermes's 12-min base, not langgraph's 2-min base.
+      // Per-node timeout: each workspace resolves its own base via
+      // @/lib/runtimeProfiles (server-override → runtime profile →
+      // default), then scales by concurrent-provisioning count. A
+      // hermes workspace in a batch alongside two langgraph workspaces
+      // gets hermes's 12-min base, not langgraph's 2-min base.
       for (const node of parsedProvisioningNodes) {
         const startedAt = tracking.get(node.id);
         if (!startedAt) continue;
-        const base = timeoutMs ?? timeoutForRuntime(node.runtime);
+        const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
         const effective = effectiveTimeoutMs(
           base,
           parsedProvisioningNodes.length,
diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
index 7fba5552..2424ea49 100644
--- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
+++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
@@ -7,11 +7,13 @@ global.fetch = vi.fn(() =>
 import { useCanvasStore } from "../../store/canvas";
 import type { WorkspaceData } from "../../store/socket";
 
+import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
 import {
-  DEFAULT_PROVISION_TIMEOUT_MS,
-  RUNTIME_TIMEOUT_OVERRIDES_MS,
-  timeoutForRuntime,
-} from "../ProvisioningTimeout";
+  DEFAULT_RUNTIME_PROFILE,
+  RUNTIME_PROFILES,
+  getRuntimeProfile,
+  provisionTimeoutForRuntime,
+} from "@/lib/runtimeProfiles";
 
 // Helper to build a WorkspaceData object
 function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@@ -196,37 +198,94 @@ describe("ProvisioningTimeout", () => {
   // the 2-min floor for fast docker runtimes while giving hermes its
   // honest 12-min budget.
 
-  describe("timeoutForRuntime", () => {
-    it("returns the 2-min default for unknown/missing runtimes", () => {
-      expect(timeoutForRuntime(undefined)).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("some-future-runtime")).toBe(
-        DEFAULT_PROVISION_TIMEOUT_MS,
-      );
+  describe("runtime profile resolution (@/lib/runtimeProfiles)", () => {
+    describe("provisionTimeoutForRuntime", () => {
+      it("returns the default for unknown/missing runtimes", () => {
+        expect(provisionTimeoutForRuntime(undefined)).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("some-future-runtime")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+      });
+
+      it("returns default for known-fast runtimes (not in profile map)", () => {
+        // If someone ever adds one of these to RUNTIME_PROFILES with a
+        // slower value, this test catches the unintended regression.
+        expect(provisionTimeoutForRuntime("claude-code")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("langgraph")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("crewai")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+      });
+
+      it("returns hermes override when runtime = hermes", () => {
+        expect(provisionTimeoutForRuntime("hermes")).toBe(
+          RUNTIME_PROFILES.hermes?.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("hermes")).toBeGreaterThanOrEqual(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs * 5,
+        );
+      });
+
+      it("server-side workspace override wins over runtime profile", () => {
+        // The resolution order is: overrides → profile → default.
+        // An operator-tunable per-workspace number on the backend
+        // (e.g. via a template manifest field) should beat the canvas
+        // runtime map.
+        expect(
+          provisionTimeoutForRuntime("hermes", {
+            provisionTimeoutMs: 60_000,
+          }),
+        ).toBe(60_000);
+        expect(
+          provisionTimeoutForRuntime("some-unknown", {
+            provisionTimeoutMs: 300_000,
+          }),
+        ).toBe(300_000);
+      });
+    });
 
-    it("returns the docker-fast 2-min default for known-fast runtimes", () => {
-      // These aren't in the override map so they get the default.
-      // If someone ever adds one of them to RUNTIME_TIMEOUT_OVERRIDES_MS,
-      // this test catches the accidental regression.
-      expect(timeoutForRuntime("claude-code")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("langgraph")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("crewai")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+    describe("getRuntimeProfile", () => {
+      it("returns a structural profile with required fields", () => {
+        const profile = getRuntimeProfile("hermes");
+        expect(profile.provisionTimeoutMs).toBeTypeOf("number");
+        expect(profile.provisionTimeoutMs).toBeGreaterThan(0);
+      });
+
+      it("default profile is a valid superset of every override", () => {
+        // Every entry in RUNTIME_PROFILES must provide fields the
+        // default does — otherwise consumers could get undefined where
+        // they expected a number. This test enforces that contract so
+        // future entries can't accidentally drop fields.
+        for (const [runtime, profile] of Object.entries(RUNTIME_PROFILES)) {
+          const resolved = getRuntimeProfile(runtime);
+          expect(
+            resolved.provisionTimeoutMs,
+            `runtime=${runtime} must resolve to a number`,
+          ).toBeTypeOf("number");
+          expect(resolved.provisionTimeoutMs).toBeGreaterThan(0);
+          // Profile's explicit value should be used iff present.
+          if (profile.provisionTimeoutMs !== undefined) {
+            expect(resolved.provisionTimeoutMs).toBe(profile.provisionTimeoutMs);
+          }
+        }
+      });
+    });
 
-    it("returns 12 min for hermes — covers cold-boot install tail", () => {
-      expect(timeoutForRuntime("hermes")).toBe(720_000);
-      expect(timeoutForRuntime("hermes")).toBe(
-        RUNTIME_TIMEOUT_OVERRIDES_MS.hermes,
-      );
-    });
-
-    it("hermes override is materially longer than the default", () => {
-      // Guard against future refactors that accidentally weaken the
-      // override (e.g. typo lowering hermes to 72_000 = 72s).
-      expect(RUNTIME_TIMEOUT_OVERRIDES_MS.hermes).toBeGreaterThanOrEqual(
-        DEFAULT_PROVISION_TIMEOUT_MS * 5,
-      );
+    describe("DEFAULT_PROVISION_TIMEOUT_MS backward-compat export", () => {
+      it("still exports the same default for legacy importers", () => {
+        expect(DEFAULT_PROVISION_TIMEOUT_MS).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+      });
     });
   });
 });
diff --git a/canvas/src/lib/runtimeProfiles.ts b/canvas/src/lib/runtimeProfiles.ts
new file mode 100644
index 00000000..68befd8a
--- /dev/null
+++ b/canvas/src/lib/runtimeProfiles.ts
@@ -0,0 +1,120 @@
+/**
+ * Runtime profiles — per-runtime UX metadata.
+ *
+ * Scaling target: hundreds of runtimes (plugin-architecture-v2 roadmap).
+ * This module is the single source of truth for runtime-specific UI knobs
+ * on the canvas side. Each runtime can declare:
+ *
+ * - provisionTimeoutMs: when to show the "taking longer than expected"
+ *   banner. Fast docker runtimes = 2min; slow source-build runtimes = 12min.
+ * - (future) label, icon, color, helpUrl, capabilities — add as needed.
+ *
+ * Resolution order (most specific wins):
+ *
+ * 1. Server-provided override on the workspace data (e.g.
+ *    `workspace.data.provisionTimeoutMs` set from a template manifest).
+ *    Lets operators tune without a canvas release once server-side
+ *    declarative config lands.
+ * 2. Per-runtime entry in RUNTIME_PROFILES.
+ * 3. DEFAULT_RUNTIME_PROFILE.
+ *
+ * Adding a new runtime:
+ * - If it's fast (≤ 2min cold boot): do nothing, the default catches it.
+ * - If it's slow: add one entry to RUNTIME_PROFILES below.
+ * - Long-term: move runtime profiles server-side so this file can shrink.
+ *
+ * Architectural note: this deliberately lives under /lib, NOT
+ * /components/ProvisioningTimeout. Other components (e.g. a
+ * "create workspace" dialog that needs to know the runtime's expected
+ * cold-boot time) should import from here too — avoids duplicating the
+ * runtime-name knowledge across the codebase.
+ */
+
+/**
+ * Structural shape of a runtime profile. Add fields as new UX knobs
+ * become runtime-specific. Every field should be optional so new runtimes
+ * can partially fill the profile without breaking older code that reads
+ * only some fields.
+ */
+export interface RuntimeProfile {
+  /** Milliseconds before the canvas shows the "taking too long" banner.
+   * Base value — the ProvisioningTimeout component still scales this by
+   * concurrent-provisioning count. */
+  provisionTimeoutMs?: number;
+  // Future extensions (kept commented until used):
+  // label?: string;
+  // icon?: string;
+  // color?: string;
+  // helpUrl?: string;
+}
+
+/** The floor every runtime inherits unless it overrides. Calibrated for
+ * docker-local fast runtimes (claude-code, langgraph, crewai) where cold
+ * boot is 30-90s. */
+export const DEFAULT_RUNTIME_PROFILE: Required<
+  Pick<RuntimeProfile, "provisionTimeoutMs">
+> = {
+  provisionTimeoutMs: 120_000, // 2 min
+};
+
+/**
+ * Named per-runtime overrides. Keep this map small and explicit —
+ * each entry is a deliberate statement that this runtime's cold-boot
+ * behavior differs materially from the default.
+ *
+ * Each override must also ship with a comment explaining WHY the default
+ * is wrong for this runtime. Unexplained numbers rot.
+ */
+export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
+  hermes: {
+    // 12 min. Installs ripgrep + ffmpeg + node22 + builds hermes-agent
+    // from source + Playwright + Chromium (~300MB download). Measured
+    // cold boots on staging EC2 routinely land at 8-13 min. Aligns
+    // with SaaS E2E's PROVISION_TIMEOUT_SECS=900 (15 min) so the UI
+    // warning lands shortly before the backend itself gives up.
+    provisionTimeoutMs: 720_000,
+  },
+};
+
+/**
+ * Data fields the canvas can consult for per-workspace overrides. These
+ * let the backend (via workspace data on the socket payload) override
+ * profile values without a canvas release.
+ *
+ * Intentionally loose typing — if a field isn't present on the node, we
+ * fall through to the runtime profile.
+ */
+export interface WorkspaceRuntimeOverrides {
+  provisionTimeoutMs?: number;
+}
+
+/**
+ * Resolve a runtime profile for a given runtime name, optionally merging
+ * server-provided per-workspace overrides on top.
+ *
+ * Resolution (most-specific wins):
+ *   overrides.provisionTimeoutMs
+ *   → RUNTIME_PROFILES[runtime].provisionTimeoutMs
+ *   → DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs
+ */
+export function getRuntimeProfile(
+  runtime: string | undefined,
+  overrides?: WorkspaceRuntimeOverrides,
+): Required<Pick<RuntimeProfile, "provisionTimeoutMs">> {
+  const profile = runtime ? RUNTIME_PROFILES[runtime] : undefined;
+  return {
+    provisionTimeoutMs:
+      overrides?.provisionTimeoutMs ??
+      profile?.provisionTimeoutMs ??
+      DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+  };
+}
+
+/** Convenience: just the provisionTimeoutMs. Equivalent to
+ * `getRuntimeProfile(runtime, overrides).provisionTimeoutMs`. */
+export function provisionTimeoutForRuntime(
+  runtime: string | undefined,
+  overrides?: WorkspaceRuntimeOverrides,
+): number {
+  return getRuntimeProfile(runtime, overrides).provisionTimeoutMs;
+}