From 9597d262ca2b3c1507a2c69bf29794aaabb6f8d1 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 24 Apr 2026 11:46:09 -0700
Subject: [PATCH 1/2] fix(canvas): runtime-aware provisioning-timeout threshold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hermes workspaces cold-boot in 8-13 min (ripgrep + ffmpeg + node22 +
hermes-agent source build + Playwright + Chromium ~300MB). The canvas's
hardcoded "Provisioning Timeout" warning fired at the 2-min mark and
told users their workspace was "stuck" while it was still mid-install.
Users hit Retry, triggering fresh cold boots and cancelling healthy
workspaces.

User-facing symptom (reported 2026-04-24 18:35Z): a hermes workspace
showed "has been provisioning for 3m 15s — it may have encountered an
issue" with Retry + Cancel buttons, while the EC2 was installing
node_modules.

Fix:
- Keep DEFAULT_PROVISION_TIMEOUT_MS = 120_000 (2 min) — correct for
  fast docker runtimes (claude-code, langgraph, crewai) where cold boot
  is 30-90s.
- Add RUNTIME_TIMEOUT_OVERRIDES_MS = { hermes: 720_000 } (12 min).
  Aligns with tests/e2e/test_staging_full_saas.sh's
  PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warns shortly before
  the backend itself gives up.
- New timeoutForRuntime() resolves the base; per-node lookup in the
  check-timeouts interval so a mixed batch (1 hermes + 2 langgraph)
  uses the right threshold for each.
- timeoutMs prop is now optional. Undefined → per-runtime lookup; a
  number → forces a single threshold for every workspace (tests use
  this for deterministic behavior).

Tests: 4 new cases pinning the runtime-aware resolution, including a
guard that catches future regressions that would weaken hermes's
budget. Existing tests unchanged (they import
DEFAULT_PROVISION_TIMEOUT_MS, which still exports 120_000). 13/13 pass.
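
Resolution at a glance (values from the new map; the concurrency
scaling in effectiveTimeoutMs still applies on top of the base):

    timeoutForRuntime("hermes")    // 720_000 (12 min, override)
    timeoutForRuntime("langgraph") // 120_000 (2 min, default; not in map)
    timeoutForRuntime(undefined)   // 120_000 (2 min, default)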

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 canvas/src/components/ProvisioningTimeout.tsx | 73 +++++++++++++++----
 .../__tests__/ProvisioningTimeout.test.tsx    | 47 +++++++++++-
 2 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx
index c4ed460c..5b254d95 100644
--- a/canvas/src/components/ProvisioningTimeout.tsx
+++ b/canvas/src/components/ProvisioningTimeout.tsx
@@ -6,11 +6,39 @@ import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
 import { ConsoleModal } from "./ConsoleModal";
 
-/** Base provisioning timeout in milliseconds (2 minutes). Used as the
- * floor; the effective threshold scales with the number of workspaces
- * concurrently provisioning (see effectiveTimeoutMs below). */
+/** Base provisioning timeout in milliseconds (2 minutes). Floor for fast
+ * runtimes (claude-code, langgraph, crewai) on Docker where cold boot
+ * is 30-90s. Slow runtimes override via RUNTIME_TIMEOUT_OVERRIDES_MS.
+ * The effective threshold also scales with concurrent-provisioning
+ * count (see effectiveTimeoutMs below). */
 export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
 
+/** Per-runtime timeout floors for cold-boot sequences that legitimately
+ * exceed the 2-minute default. A too-low threshold creates false-alarm
+ * banners telling users "your workspace is stuck" while it's actually
+ * mid-install — confusing, and it makes users retry workspaces that
+ * would have come online on their own.
+ *
+ * Hermes at 12min: installs ripgrep + ffmpeg + node22 + builds
+ * hermes-agent from source + Playwright + Chromium (~300MB). Measured
+ * boots on staging EC2 routinely land at 8-13 min. Aligns with the
+ * SaaS E2E PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warning lands
+ * shortly before the backend itself gives up.
+ *
+ * Add entries here as new runtimes surface false-alarm complaints.
+ * Runtimes absent from the map get DEFAULT_PROVISION_TIMEOUT_MS. */
+export const RUNTIME_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
+  hermes: 720_000, // 12 min — see comment above
+};
+
+/** Resolve the base timeout for a workspace given its runtime. */
+export function timeoutForRuntime(runtime: string | undefined): number {
+  if (runtime && runtime in RUNTIME_TIMEOUT_OVERRIDES_MS) {
+    return RUNTIME_TIMEOUT_OVERRIDES_MS[runtime];
+  }
+  return DEFAULT_PROVISION_TIMEOUT_MS;
+}
+
 /** The server provisions up to `PROVISION_CONCURRENCY` containers at
  * once and paces the rest in a queue (`workspaceCreatePacingMs` =
  * 2s). Mirrors the Go constants — if those change, bump these. */
@@ -43,8 +71,12 @@ interface TimeoutEntry {
  * time per node. */
 export function ProvisioningTimeout({
-  timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
+  timeoutMs,
 }: {
+  // If undefined (the default when mounted without a prop), each workspace's
+  // threshold is resolved from its runtime via timeoutForRuntime().
+  // Pass an explicit number to force a single threshold for every workspace
+  // (used by tests that want deterministic behavior regardless of runtime).
   timeoutMs?: number;
 }) {
   const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
@@ -57,19 +89,28 @@ export function ProvisioningTimeout({
   const [dismissed, setDismissed] = useState<Set<string>>(new Set());
 
   // Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
-  // (filter+map creates new array reference on every store update)
+  // (filter+map creates new array reference on every store update).
+  // Runtime included so the timeout threshold can be resolved per-node
+  // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
+  // runtimes — a single threshold would false-alarm on one or the other).
+  // Separator: `|` between fields, `,` between nodes. Names may contain
+  // anything the user typed; strip `|` and `,` so serialization round-trips.
   const provisioningNodes = useCanvasStore((s) => {
     const result = s.nodes
       .filter((n) => n.data.status === "provisioning")
-      .map((n) => `${n.id}:${n.data.name}`);
+      .map((n) => {
+        const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
+        const runtime = n.data.runtime ?? "";
+        return `${n.id}|${safeName}|${runtime}`;
+      });
     return result.join(",");
   });
 
   const parsedProvisioningNodes = useMemo(
     () =>
       provisioningNodes
         ? provisioningNodes.split(",").map((entry) => {
-            const [id, name] = entry.split(":");
-            return { id, name };
+            const [id, name, runtime] = entry.split("|");
+            return { id, name, runtime };
           })
         : [],
     [provisioningNodes],
@@ -113,14 +154,20 @@
     const interval = setInterval(() => {
       const now = Date.now();
       const newTimedOut: TimeoutEntry[] = [];
-      const effective = effectiveTimeoutMs(
-        timeoutMs,
-        parsedProvisioningNodes.length,
-      );
+      // Per-node timeout: each workspace has its own base (runtime-aware)
+      // scaled by the total concurrent-provisioning count. A hermes
+      // workspace in a batch alongside two langgraph workspaces gets
+      // hermes's 12-min base, not langgraph's 2-min base.
       for (const node of parsedProvisioningNodes) {
         const startedAt = tracking.get(node.id);
-        if (startedAt && now - startedAt >= effective) {
+        if (!startedAt) continue;
+        const base = timeoutMs ?? timeoutForRuntime(node.runtime);
+        const effective = effectiveTimeoutMs(
+          base,
+          parsedProvisioningNodes.length,
+        );
+        if (now - startedAt >= effective) {
           newTimedOut.push({
             workspaceId: node.id,
             workspaceName: node.name,
diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
index f1c5b150..7fba5552 100644
--- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
+++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
@@ -7,7 +7,11 @@ global.fetch = vi.fn(() =>
 import { useCanvasStore } from "../../store/canvas";
 import type { WorkspaceData } from "../../store/socket";
 
-import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
+import {
+  DEFAULT_PROVISION_TIMEOUT_MS,
+  RUNTIME_TIMEOUT_OVERRIDES_MS,
+  timeoutForRuntime,
+} from "../ProvisioningTimeout";
 
 // Helper to build a WorkspaceData object
 function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@@ -184,4 +188,45 @@ describe("ProvisioningTimeout", () => {
       .nodes.filter((n) => n.data.status === "provisioning");
     expect(stillProvisioning).toHaveLength(2);
   });
+
+  // ── Runtime-aware timeout regression tests (2026-04-24 outage) ──────────
+  // Prior to this, a hermes workspace consistently false-alarmed at 2 min
+  // into its 8-13 min cold boot, pushing users to retry something that
+  // would have come online on its own. The runtime-aware override keeps
+  // the 2-min floor for fast docker runtimes while giving hermes its
+  // honest 12-min budget.
+
+  describe("timeoutForRuntime", () => {
+    it("returns the 2-min default for unknown/missing runtimes", () => {
+      expect(timeoutForRuntime(undefined)).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("some-future-runtime")).toBe(
+        DEFAULT_PROVISION_TIMEOUT_MS,
+      );
+    });
+
+    it("returns the docker-fast 2-min default for known-fast runtimes", () => {
+      // These aren't in the override map so they get the default.
+      // If someone ever adds one of them to RUNTIME_TIMEOUT_OVERRIDES_MS,
+      // this test catches the accidental regression.
+      expect(timeoutForRuntime("claude-code")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("langgraph")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("crewai")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+    });
+
+    it("returns 12 min for hermes — covers cold-boot install tail", () => {
+      expect(timeoutForRuntime("hermes")).toBe(720_000);
+      expect(timeoutForRuntime("hermes")).toBe(
+        RUNTIME_TIMEOUT_OVERRIDES_MS.hermes,
+      );
+    });
+
+    it("hermes override is materially longer than the default", () => {
+      // Guard against future refactors that accidentally weaken the
+      // override (e.g. typo lowering hermes to 72_000 = 72s).
+      expect(RUNTIME_TIMEOUT_OVERRIDES_MS.hermes).toBeGreaterThanOrEqual(
+        DEFAULT_PROVISION_TIMEOUT_MS * 5,
+      );
+    });
+  });
 });

From 0b237ed9dde24168900d47897afd76fc6d314643 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 24 Apr 2026 11:48:39 -0700
Subject: [PATCH 2/2] refactor(canvas): extract runtime profiles to @/lib/runtimeProfiles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Preparation for a "hundreds of runtimes" plugin ecosystem. Keeping the
runtime-specific UX knobs inline inside ProvisioningTimeout scales
badly — every new runtime would require editing a component, not just
adding a table entry. Other components (create-workspace dialog,
workspace card tooltips, etc.) will want the same runtime metadata.
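
As an illustration, a consumer outside this component could read the
same metadata with no runtime-name knowledge of its own (sketch; the
ExpectedBootHint component is hypothetical, getRuntimeProfile is the
real resolver added by this patch):

    import { getRuntimeProfile } from "@/lib/runtimeProfiles";

    // Hypothetical create-workspace dialog hint.
    function ExpectedBootHint({ runtime }: { runtime?: string }) {
      const { provisionTimeoutMs } = getRuntimeProfile(runtime);
      const mins = Math.ceil(provisionTimeoutMs / 60_000);
      return <span>first boot can take up to ~{mins} min</span>;
    }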

Changes:
- New file `canvas/src/lib/runtimeProfiles.ts` owns:
  * `RuntimeProfile` type — structural shape, every field optional so
    new runtimes can partially fill without breaking consumers.
  * `DEFAULT_RUNTIME_PROFILE` — 2-min default floor (docker-fast).
  * `RUNTIME_PROFILES` — named overrides (currently: hermes 12 min).
  * `WorkspaceRuntimeOverrides` — interface for server-provided
    per-workspace overrides, so operators can tune via template
    manifest / workspace metadata without a canvas release.
  * `getRuntimeProfile()` — resolver with overrides → profile → default
    priority.
  * `provisionTimeoutForRuntime()` — convenience wrapper.
- `ProvisioningTimeout.tsx` now delegates to the profile module.
  `DEFAULT_PROVISION_TIMEOUT_MS` re-exported for legacy test importers.
- Tests: 16/16 (up from 9 before the first fix). Adds pinning for:
  * overrides > profile > default priority chain
  * "every entry in RUNTIME_PROFILES resolves to a number" contract
  * backward-compat export

Adding a new slow runtime is now one table entry in
`canvas/src/lib/runtimeProfiles.ts` with a mandatory WHY comment.
Moving to server-driven profiles later is a ~10-line change (the
resolver already threads WorkspaceRuntimeOverrides through).
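
Sketch of what one such entry looks like ("slow-runtime-x" is a
made-up name; hermes is the only real entry today):

    export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
      hermes: { provisionTimeoutMs: 720_000 /* WHY: see the file */ },
      "slow-runtime-x": {
        // WHY: compiles its agent toolchain from source on first boot;
        // staging cold boots measured around 5 min.
        provisionTimeoutMs: 360_000, // 6 min
      },
    };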

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 canvas/src/components/ProvisioningTimeout.tsx |  51 +++-----
 .../__tests__/ProvisioningTimeout.test.tsx    | 121 +++++++++++++-----
 canvas/src/lib/runtimeProfiles.ts             | 120 +++++++++++++++++
 3 files changed, 225 insertions(+), 67 deletions(-)
 create mode 100644 canvas/src/lib/runtimeProfiles.ts

diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx
index 5b254d95..1c09fa3b 100644
--- a/canvas/src/components/ProvisioningTimeout.tsx
+++ b/canvas/src/components/ProvisioningTimeout.tsx
@@ -6,38 +6,16 @@ import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
 import { ConsoleModal } from "./ConsoleModal";
 
-/** Base provisioning timeout in milliseconds (2 minutes). Floor for fast
- * runtimes (claude-code, langgraph, crewai) on Docker where cold boot
- * is 30-90s. Slow runtimes override via RUNTIME_TIMEOUT_OVERRIDES_MS.
- * The effective threshold also scales with concurrent-provisioning
- * count (see effectiveTimeoutMs below). */
-export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
+import {
+  DEFAULT_RUNTIME_PROFILE,
+  provisionTimeoutForRuntime,
+} from "@/lib/runtimeProfiles";
 
-/** Per-runtime timeout floors for cold-boot sequences that legitimately
- * exceed the 2-minute default. A too-low threshold creates false-alarm
- * banners telling users "your workspace is stuck" while it's actually
- * mid-install — confusing, and it makes users retry workspaces that
- * would have come online on their own.
- *
- * Hermes at 12min: installs ripgrep + ffmpeg + node22 + builds
- * hermes-agent from source + Playwright + Chromium (~300MB). Measured
- * boots on staging EC2 routinely land at 8-13 min. Aligns with the
- * SaaS E2E PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warning lands
- * shortly before the backend itself gives up.
- *
- * Add entries here as new runtimes surface false-alarm complaints.
- * Runtimes absent from the map get DEFAULT_PROVISION_TIMEOUT_MS. */
-export const RUNTIME_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
-  hermes: 720_000, // 12 min — see comment above
-};
+/** Re-export for backward compatibility with tests and other importers
+ * that previously imported DEFAULT_PROVISION_TIMEOUT_MS from this file.
+ * New code should read via getRuntimeProfile() from @/lib/runtimeProfiles. */
+export const DEFAULT_PROVISION_TIMEOUT_MS =
+  DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs;
 
-/** Resolve the base timeout for a workspace given its runtime. */
-export function timeoutForRuntime(runtime: string | undefined): number {
-  if (runtime && runtime in RUNTIME_TIMEOUT_OVERRIDES_MS) {
-    return RUNTIME_TIMEOUT_OVERRIDES_MS[runtime];
-  }
-  return DEFAULT_PROVISION_TIMEOUT_MS;
-}
 
 /** The server provisions up to `PROVISION_CONCURRENCY` containers at
  * once and paces the rest in a queue (`workspaceCreatePacingMs` =
  * 2s). Mirrors the Go constants — if those change, bump these. */
@@ -155,14 +133,15 @@
       const now = Date.now();
       const newTimedOut: TimeoutEntry[] = [];
 
-      // Per-node timeout: each workspace has its own base (runtime-aware)
-      // scaled by the total concurrent-provisioning count. A hermes
-      // workspace in a batch alongside two langgraph workspaces gets
-      // hermes's 12-min base, not langgraph's 2-min base.
+      // Per-node timeout: each workspace resolves its own base via
+      // @/lib/runtimeProfiles (server-override → runtime profile →
+      // default), then scales by concurrent-provisioning count. A
+      // hermes workspace in a batch alongside two langgraph workspaces
+      // gets hermes's 12-min base, not langgraph's 2-min base.
       for (const node of parsedProvisioningNodes) {
         const startedAt = tracking.get(node.id);
         if (!startedAt) continue;
-        const base = timeoutMs ?? timeoutForRuntime(node.runtime);
+        const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
         const effective = effectiveTimeoutMs(
           base,
           parsedProvisioningNodes.length,
diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
index 7fba5552..2424ea49 100644
--- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
+++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx
@@ -7,11 +7,13 @@ global.fetch = vi.fn(() =>
 import { useCanvasStore } from "../../store/canvas";
 import type { WorkspaceData } from "../../store/socket";
 
+import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
 import {
-  DEFAULT_PROVISION_TIMEOUT_MS,
-  RUNTIME_TIMEOUT_OVERRIDES_MS,
-  timeoutForRuntime,
-} from "../ProvisioningTimeout";
+  DEFAULT_RUNTIME_PROFILE,
+  RUNTIME_PROFILES,
+  getRuntimeProfile,
+  provisionTimeoutForRuntime,
+} from "@/lib/runtimeProfiles";
 
 // Helper to build a WorkspaceData object
 function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@@ -196,37 +198,94 @@ describe("ProvisioningTimeout", () => {
   // the 2-min floor for fast docker runtimes while giving hermes its
   // honest 12-min budget.
 
-  describe("timeoutForRuntime", () => {
-    it("returns the 2-min default for unknown/missing runtimes", () => {
-      expect(timeoutForRuntime(undefined)).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("some-future-runtime")).toBe(
-        DEFAULT_PROVISION_TIMEOUT_MS,
-      );
+  describe("runtime profile resolution (@/lib/runtimeProfiles)", () => {
+    describe("provisionTimeoutForRuntime", () => {
+      it("returns the default for unknown/missing runtimes", () => {
+        expect(provisionTimeoutForRuntime(undefined)).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("some-future-runtime")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+      });
+
+      it("returns default for known-fast runtimes (not in profile map)", () => {
+        // If someone ever adds one of these to RUNTIME_PROFILES with a
+        // slower value, this test catches the unintended regression.
+        expect(provisionTimeoutForRuntime("claude-code")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("langgraph")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("crewai")).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+      });
+
+      it("returns hermes override when runtime = hermes", () => {
+        expect(provisionTimeoutForRuntime("hermes")).toBe(
+          RUNTIME_PROFILES.hermes?.provisionTimeoutMs,
+        );
+        expect(provisionTimeoutForRuntime("hermes")).toBeGreaterThanOrEqual(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs * 5,
+        );
+      });
+
+      it("server-side workspace override wins over runtime profile", () => {
+        // The resolution order is: overrides → profile → default.
+        // An operator-tunable per-workspace number on the backend
+        // (e.g. via a template manifest field) should beat the canvas
+        // runtime map.
+        expect(
+          provisionTimeoutForRuntime("hermes", {
+            provisionTimeoutMs: 60_000,
+          }),
+        ).toBe(60_000);
+        expect(
+          provisionTimeoutForRuntime("some-unknown", {
+            provisionTimeoutMs: 300_000,
+          }),
+        ).toBe(300_000);
+      });
+    });
 
-    it("returns the docker-fast 2-min default for known-fast runtimes", () => {
-      // These aren't in the override map so they get the default.
-      // If someone ever adds one of them to RUNTIME_TIMEOUT_OVERRIDES_MS,
-      // this test catches the accidental regression.
-      expect(timeoutForRuntime("claude-code")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("langgraph")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
-      expect(timeoutForRuntime("crewai")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+    describe("getRuntimeProfile", () => {
+      it("returns a structural profile with required fields", () => {
+        const profile = getRuntimeProfile("hermes");
+        expect(profile.provisionTimeoutMs).toBeTypeOf("number");
+        expect(profile.provisionTimeoutMs).toBeGreaterThan(0);
+      });
+
+      it("default profile is a valid superset of every override", () => {
+        // Every entry in RUNTIME_PROFILES must provide fields the
+        // default does — otherwise consumers could get undefined where
+        // they expected a number. This test enforces that contract so
+        // future entries can't accidentally drop fields.
+        for (const [runtime, profile] of Object.entries(RUNTIME_PROFILES)) {
+          const resolved = getRuntimeProfile(runtime);
+          expect(
+            resolved.provisionTimeoutMs,
+            `runtime=${runtime} must resolve to a number`,
+          ).toBeTypeOf("number");
+          expect(resolved.provisionTimeoutMs).toBeGreaterThan(0);
+          // Profile's explicit value should be used iff present.
+          if (profile.provisionTimeoutMs !== undefined) {
+            expect(resolved.provisionTimeoutMs).toBe(profile.provisionTimeoutMs);
+          }
+        }
+      });
+    });
 
-    it("returns 12 min for hermes — covers cold-boot install tail", () => {
-      expect(timeoutForRuntime("hermes")).toBe(720_000);
-      expect(timeoutForRuntime("hermes")).toBe(
-        RUNTIME_TIMEOUT_OVERRIDES_MS.hermes,
-      );
-    });
-
-    it("hermes override is materially longer than the default", () => {
-      // Guard against future refactors that accidentally weaken the
-      // override (e.g. typo lowering hermes to 72_000 = 72s).
-      expect(RUNTIME_TIMEOUT_OVERRIDES_MS.hermes).toBeGreaterThanOrEqual(
-        DEFAULT_PROVISION_TIMEOUT_MS * 5,
-      );
+    describe("DEFAULT_PROVISION_TIMEOUT_MS backward-compat export", () => {
+      it("still exports the same default for legacy importers", () => {
+        expect(DEFAULT_PROVISION_TIMEOUT_MS).toBe(
+          DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+        );
+      });
     });
   });
 });
diff --git a/canvas/src/lib/runtimeProfiles.ts b/canvas/src/lib/runtimeProfiles.ts
new file mode 100644
index 00000000..68befd8a
--- /dev/null
+++ b/canvas/src/lib/runtimeProfiles.ts
@@ -0,0 +1,120 @@
+/**
+ * Runtime profiles — per-runtime UX metadata.
+ *
+ * Scaling target: hundreds of runtimes (plugin-architecture-v2 roadmap).
+ * This module is the single source of truth for runtime-specific UI knobs
+ * on the canvas side. Each runtime can declare:
+ *
+ * - provisionTimeoutMs: when to show the "taking longer than expected"
+ *   banner. Fast docker runtimes = 2min; slow source-build runtimes = 12min.
+ * - (future) label, icon, color, helpUrl, capabilities — add as needed.
+ *
+ * Resolution order (most specific wins):
+ *
+ * 1. Server-provided override on the workspace data (e.g.
+ *    `workspace.data.provisionTimeoutMs` set from a template manifest).
+ *    Lets operators tune without a canvas release once server-side
+ *    declarative config lands.
+ * 2. Per-runtime entry in RUNTIME_PROFILES.
+ * 3. DEFAULT_RUNTIME_PROFILE.
+ *
+ * Adding a new runtime:
+ * - If it's fast (≤ 2min cold boot): do nothing, the default catches it.
+ * - If it's slow: add one entry to RUNTIME_PROFILES below.
+ * - Long-term: move runtime profiles server-side so this file can shrink.
+ *
+ * Architectural note: this deliberately lives under /lib, NOT
+ * /components/ProvisioningTimeout. Other components (e.g. a
+ * "create workspace" dialog that needs to know the runtime's expected
+ * cold-boot time) should import from here too — avoids duplicating the
+ * runtime-name knowledge across the codebase.
+ */
+
+/**
+ * Structural shape of a runtime profile. Add fields as new UX knobs
+ * become runtime-specific. Every field should be optional so new runtimes
+ * can partially fill the profile without breaking older code that reads
+ * only some fields.
+ */
+export interface RuntimeProfile {
+  /** Milliseconds before the canvas shows the "taking too long" banner.
+   * Base value — the ProvisioningTimeout component still scales this by
+   * concurrent-provisioning count. */
+  provisionTimeoutMs?: number;
+  // Future extensions (kept commented until used):
+  // label?: string;
+  // icon?: string;
+  // color?: string;
+  // helpUrl?: string;
+}
+
+/** The floor every runtime inherits unless it overrides. Calibrated for
+ * docker-local fast runtimes (claude-code, langgraph, crewai) where cold
+ * boot is 30-90s. */
+export const DEFAULT_RUNTIME_PROFILE: Required<
+  Pick<RuntimeProfile, "provisionTimeoutMs">
+> = {
+  provisionTimeoutMs: 120_000, // 2 min
+};
+
+/**
+ * Named per-runtime overrides. Keep this map small and explicit —
+ * each entry is a deliberate statement that this runtime's cold-boot
+ * behavior differs materially from the default.
+ *
+ * Each override must also ship with a comment explaining WHY the default
+ * is wrong for this runtime. Unexplained numbers rot.
+ */
+export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
+  hermes: {
+    // 12 min. Installs ripgrep + ffmpeg + node22 + builds hermes-agent
+    // from source + Playwright + Chromium (~300MB download). Measured
+    // cold boots on staging EC2 routinely land at 8-13 min. Aligns
+    // with SaaS E2E's PROVISION_TIMEOUT_SECS=900 (15 min) so the UI
+    // warning lands shortly before the backend itself gives up.
+    provisionTimeoutMs: 720_000,
+  },
+};
+
+/**
+ * Data fields the canvas can consult for per-workspace overrides. These
+ * let the backend (via workspace data on the socket payload) override
+ * profile values without a canvas release.
+ *
+ * Intentionally loose typing — if a field isn't present on the node, we
+ * fall through to the runtime profile.
+ */
+export interface WorkspaceRuntimeOverrides {
+  provisionTimeoutMs?: number;
+}
+
+/**
+ * Resolve a runtime profile for a given runtime name, optionally merging
+ * server-provided per-workspace overrides on top.
+ *
+ * Resolution (most-specific wins):
+ *   overrides.provisionTimeoutMs
+ *   → RUNTIME_PROFILES[runtime].provisionTimeoutMs
+ *   → DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs
+ */
+export function getRuntimeProfile(
+  runtime: string | undefined,
+  overrides?: WorkspaceRuntimeOverrides,
+): Required<Pick<RuntimeProfile, "provisionTimeoutMs">> {
+  const profile = runtime ? RUNTIME_PROFILES[runtime] : undefined;
+  return {
+    provisionTimeoutMs:
+      overrides?.provisionTimeoutMs ??
+      profile?.provisionTimeoutMs ??
+      DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
+  };
+}
+
+/** Convenience: just the provisionTimeoutMs. Equivalent to
+ * `getRuntimeProfile(runtime, overrides).provisionTimeoutMs`. */
+export function provisionTimeoutForRuntime(
+  runtime: string | undefined,
+  overrides?: WorkspaceRuntimeOverrides,
+): number {
+  return getRuntimeProfile(runtime, overrides).provisionTimeoutMs;
+}