fix(canvas): runtime-aware provisioning-timeout threshold

Hermes workspaces cold-boot in 8-13 min (ripgrep + ffmpeg + node22 + hermes-agent source build + Playwright + Chromium ~300MB). The canvas's 2-min hardcoded "Provisioning Timeout" warning fired at ~2min and told users their workspace was "stuck" while it was still mid-install. Users hit Retry, triggering fresh cold boots and cancelling healthy workspaces. User-facing symptom (reported 2026-04-24 18:35Z): hermes workspace showed "has been provisioning for 3m 15s — it may have encountered an issue" with Retry + Cancel buttons, while the EC2 was installing node_modules. Fix: - Keep DEFAULT_PROVISION_TIMEOUT_MS = 120_000 (2min) — correct for fast docker runtimes (claude-code, langgraph, crewai) where cold boot is 30-90s. - Add RUNTIME_TIMEOUT_OVERRIDES_MS = { hermes: 720_000 } (12min). Aligns with tests/e2e/test_staging_full_saas.sh's PROVISION_TIMEOUT_SECS=900 (15min) so UI warns shortly before the backend itself gives up. - New timeoutForRuntime() resolves the base; per-node lookup in the check-timeouts interval so a mixed batch (1 hermes + 2 langgraph) uses the right threshold for each. - timeoutMs prop is now optional. Undefined → per-runtime lookup; a number → forces a single threshold for every workspace (tests use this for deterministic behavior). Tests: 4 new cases pinning the runtime-aware resolution, including a guard that catches future regressions that would weaken hermes's budget. Existing tests unchanged (they import DEFAULT_PROVISION_TIMEOUT_MS which still exports 120_000). 13/13 pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 11:46:09 -07:00 · 2026-04-24 11:46:09 -07:00 · 9597d262ca
commit 9597d262ca
parent f5d44eba8c
2 changed files with 106 additions and 14 deletions
--- a/canvas/src/components/ProvisioningTimeout.tsx
+++ b/canvas/src/components/ProvisioningTimeout.tsx
@ -6,11 +6,39 @@ import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
 import { ConsoleModal } from "./ConsoleModal";

-/** Base provisioning timeout in milliseconds (2 minutes). Used as the
- *  floor; the effective threshold scales with the number of workspaces
- *  concurrently provisioning (see effectiveTimeoutMs below). */
+/** Base provisioning timeout in milliseconds (2 minutes). Floor for fast
+ *  runtimes (claude-code, langgraph, crewai) on Docker where cold boot
+ *  is 30-90s. Slow runtimes override via RUNTIME_TIMEOUT_OVERRIDES_MS.
+ *  The effective threshold also scales with concurrent-provisioning
+ *  count (see effectiveTimeoutMs below). */
 export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;

+/** Per-runtime timeout floors for cold-boot sequences that legitimately
+ *  exceed the 2-minute default. A too-low threshold creates false-alarm
+ *  banners telling users "your workspace is stuck" while it's actually
+ *  mid-install — confusing, and it makes users retry workspaces that
+ *  would have come online on their own.
+ *
+ *  Hermes at 12min: installs ripgrep + ffmpeg + node22 + builds
+ *  hermes-agent from source + Playwright + Chromium (~300MB). Measured
+ *  boots on staging EC2 routinely land at 8-13 min. Aligns with the
+ *  SaaS E2E PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warning lands
+ *  shortly before the backend itself gives up.
+ *
+ *  Add entries here as new runtimes surface false-alarm complaints.
+ *  Runtimes absent from the map get DEFAULT_PROVISION_TIMEOUT_MS. */
+export const RUNTIME_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
+  hermes: 720_000, // 12 min — see comment above
+};
+
+/** Resolve the base timeout for a workspace given its runtime. */
+export function timeoutForRuntime(runtime: string | undefined): number {
+  if (runtime && runtime in RUNTIME_TIMEOUT_OVERRIDES_MS) {
+    return RUNTIME_TIMEOUT_OVERRIDES_MS[runtime];
+  }
+  return DEFAULT_PROVISION_TIMEOUT_MS;
+}
+
 /** The server provisions up to `PROVISION_CONCURRENCY` containers at
 *  once and paces the rest in a queue (`workspaceCreatePacingMs` =
 *  2s). Mirrors the Go constants — if those change, bump these. */
@ -43,8 +71,12 @@ interface TimeoutEntry {
 * time per node.
 */
 export function ProvisioningTimeout({
-  timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
+  timeoutMs,
 }: {
+  // If undefined (the default when mounted without a prop), each workspace's
+  // threshold is resolved from its runtime via timeoutForRuntime().
+  // Pass an explicit number to force a single threshold for every workspace
+  // (used by tests that want deterministic behavior regardless of runtime).
  timeoutMs?: number;
 }) {
  const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
@ -57,19 +89,28 @@ export function ProvisioningTimeout({
  const [dismissed, setDismissed] = useState<Set<string>>(new Set());

  // Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
-  // (filter+map creates new array reference on every store update)
+  // (filter+map creates new array reference on every store update).
+  // Runtime included so the timeout threshold can be resolved per-node
+  // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
+  //  runtimes — a single threshold would false-alarm on one or the other).
+  // Separator: `|` between fields, `,` between nodes. Names may contain
+  // anything the user typed; strip `|` and `,` so serialization round-trips.
  const provisioningNodes = useCanvasStore((s) => {
    const result = s.nodes
      .filter((n) => n.data.status === "provisioning")
-      .map((n) => `${n.id}:${n.data.name}`);
+      .map((n) => {
+        const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
+        const runtime = n.data.runtime ?? "";
+        return `${n.id}|${safeName}|${runtime}`;
+      });
    return result.join(",");
  });
  const parsedProvisioningNodes = useMemo(
    () =>
      provisioningNodes
        ? provisioningNodes.split(",").map((entry) => {
-            const [id, name] = entry.split(":");
-            return { id, name };
+            const [id, name, runtime] = entry.split("|");
+            return { id, name, runtime };
          })
        : [],
    [provisioningNodes],
@ -113,14 +154,20 @@ export function ProvisioningTimeout({
    const interval = setInterval(() => {
      const now = Date.now();
      const newTimedOut: TimeoutEntry[] = [];
-      const effective = effectiveTimeoutMs(
-        timeoutMs,
-        parsedProvisioningNodes.length,
-      );

+      // Per-node timeout: each workspace has its own base (runtime-aware)
+      // scaled by the total concurrent-provisioning count. A hermes
+      // workspace in a batch alongside two langgraph workspaces gets
+      // hermes's 12-min base, not langgraph's 2-min base.
      for (const node of parsedProvisioningNodes) {
        const startedAt = tracking.get(node.id);
-        if (startedAt && now - startedAt >= effective) {
+        if (!startedAt) continue;
+        const base = timeoutMs ?? timeoutForRuntime(node.runtime);
+        const effective = effectiveTimeoutMs(
+          base,
+          parsedProvisioningNodes.length,
+        );
+        if (now - startedAt >= effective) {
          newTimedOut.push({
            workspaceId: node.id,
            workspaceName: node.name,
--- a/canvas/src/components/tests/ProvisioningTimeout.test.tsx
+++ b/canvas/src/components/tests/ProvisioningTimeout.test.tsx
@ -7,7 +7,11 @@ global.fetch = vi.fn(() =>

 import { useCanvasStore } from "../../store/canvas";
 import type { WorkspaceData } from "../../store/socket";
-import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
+import {
+  DEFAULT_PROVISION_TIMEOUT_MS,
+  RUNTIME_TIMEOUT_OVERRIDES_MS,
+  timeoutForRuntime,
+} from "../ProvisioningTimeout";

 // Helper to build a WorkspaceData object
 function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@ -184,4 +188,45 @@ describe("ProvisioningTimeout", () => {
      .nodes.filter((n) => n.data.status === "provisioning");
    expect(stillProvisioning).toHaveLength(2);
  });
+
+  // ── Runtime-aware timeout regression tests (2026-04-24 outage) ────────────
+  // Prior to this, a hermes workspace consistently false-alarmed at 2 min
+  // into its 8-13 min cold boot, pushing users to retry something that
+  // would have come online on its own. The runtime-aware override keeps
+  // the 2-min floor for fast docker runtimes while giving hermes its
+  // honest 12-min budget.
+
+  describe("timeoutForRuntime", () => {
+    it("returns the 2-min default for unknown/missing runtimes", () => {
+      expect(timeoutForRuntime(undefined)).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("some-future-runtime")).toBe(
+        DEFAULT_PROVISION_TIMEOUT_MS,
+      );
+    });
+
+    it("returns the docker-fast 2-min default for known-fast runtimes", () => {
+      // These aren't in the override map so they get the default.
+      // If someone ever adds one of them to RUNTIME_TIMEOUT_OVERRIDES_MS,
+      // this test catches the accidental regression.
+      expect(timeoutForRuntime("claude-code")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("langgraph")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("crewai")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+    });
+
+    it("returns 12 min for hermes — covers cold-boot install tail", () => {
+      expect(timeoutForRuntime("hermes")).toBe(720_000);
+      expect(timeoutForRuntime("hermes")).toBe(
+        RUNTIME_TIMEOUT_OVERRIDES_MS.hermes,
+      );
+    });
+
+    it("hermes override is materially longer than the default", () => {
+      // Guard against future refactors that accidentally weaken the
+      // override (e.g. typo lowering hermes to 72_000 = 72s).
+      expect(RUNTIME_TIMEOUT_OVERRIDES_MS.hermes).toBeGreaterThanOrEqual(
+        DEFAULT_PROVISION_TIMEOUT_MS * 5,
+      );
+    });
+  });
 });