feat(canvas): per-workspace provision_timeout_ms override (#2054)

Phase 1 of moving runtime UX knobs server-side. Builds the canvas
foundation: a workspace can carry its own provision_timeout_ms
(sourced server-side from a template manifest in a follow-up PR),
and ProvisioningTimeout's resolver respects it per-node.

Previously the resolver had a Props-level timeoutMs that applied to
ALL nodes — fine for tests but wrong for production, where one batch
could mix runtimes (hermes 12-min cold boot alongside docker 2-min).
The runtime profile fallback already handles per-runtime defaults;
this PR adds the per-WORKSPACE override layer above that.

Resolution priority (most specific wins):
  1. node.provisionTimeoutMs — server-declared per-workspace
     override (this PR's new field)
  2. timeoutMs prop — single-threshold test override
  3. runtime profile in @/lib/runtimeProfiles
  4. DEFAULT_RUNTIME_PROFILE

Changes:
- WorkspaceData (socket): add optional provision_timeout_ms
- WorkspaceNodeData: add optional provisionTimeoutMs
- canvas-topology hydrate: thread the field through to node.data
- ProvisioningTimeout: extend the serialized-string node iteration
  to carry provisionTimeoutMs (4-field positional split); pass as
  the second arg to provisionTimeoutForRuntime
- 3 new tests in ProvisioningTimeout.test.tsx covering hydrate
  threading, null fall-through, and resolver priority

Phase 2 (separate PR, blocked on workspace-server template-config
loader): workspace-server reads provision_timeout_seconds from
template config.yaml at provision time, includes
provision_timeout_ms in the workspace API/socket response. Phase 3
(template-repo PR): template-hermes config.yaml declares
provision_timeout_seconds: 720; canvas RUNTIME_PROFILES.hermes
becomes redundant and can be removed.

19/19 tests pass (3 new + 16 existing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
rabbitblood 2026-04-26 06:02:56 -07:00
parent dff14c010e
commit 1a273f21f5
5 changed files with 94 additions and 4 deletions

View File

@@ -71,15 +71,19 @@ export function ProvisioningTimeout({
// Runtime included so the timeout threshold can be resolved per-node
// (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
// runtimes — a single threshold would false-alarm on one or the other).
// provisionTimeoutMs added by #2054 — server-declared per-workspace
// override that wins over the runtime profile when present.
// Separator: `|` between fields, `,` between nodes. Names may contain
// anything the user typed; strip `|` and `,` so serialization round-trips.
// Empty-string sentinels for missing values so split/index stays positional.
const provisioningNodes = useCanvasStore((s) => {
const result = s.nodes
.filter((n) => n.data.status === "provisioning")
.map((n) => {
const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
const runtime = n.data.runtime ?? "";
return `${n.id}|${safeName}|${runtime}`;
const provisionTimeoutMs = n.data.provisionTimeoutMs ?? "";
return `${n.id}|${safeName}|${runtime}|${provisionTimeoutMs}`;
});
return result.join(",");
});
@@ -87,8 +91,14 @@ export function ProvisioningTimeout({
() =>
provisioningNodes
? provisioningNodes.split(",").map((entry) => {
const [id, name, runtime] = entry.split("|");
return { id, name, runtime };
const [id, name, runtime, provisionTimeoutMs] = entry.split("|");
const ptms = provisionTimeoutMs ? Number(provisionTimeoutMs) : undefined;
return {
id,
name,
runtime,
provisionTimeoutMs: Number.isFinite(ptms) ? ptms : undefined,
};
})
: [],
[provisioningNodes],
@@ -138,10 +148,19 @@ export function ProvisioningTimeout({
// default), then scales by concurrent-provisioning count. A
// hermes workspace in a batch alongside two langgraph workspaces
// gets hermes's 12-min base, not langgraph's 2-min base.
//
// Resolution priority (most specific wins):
// 1. node.provisionTimeoutMs — server-declared per-workspace
// override (#2054, sourced from template manifest)
// 2. timeoutMs prop — single-threshold test override
// 3. runtime profile in @/lib/runtimeProfiles
// 4. DEFAULT_RUNTIME_PROFILE
for (const node of parsedProvisioningNodes) {
const startedAt = tracking.get(node.id);
if (!startedAt) continue;
const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
const base = provisionTimeoutForRuntime(node.runtime, {
provisionTimeoutMs: node.provisionTimeoutMs ?? timeoutMs,
});
const effective = effectiveTimeoutMs(
base,
parsedProvisioningNodes.length,

View File

@@ -287,5 +287,60 @@ describe("ProvisioningTimeout", () => {
);
});
});
// #2054 — per-workspace server override threading from socket
// payload through node-data into ProvisioningTimeout's resolver.
// Doesn't render the component; verifies the data path lands the
// value where ProvisioningTimeout reads it from.
describe("server-side per-workspace override (#2054)", () => {
it("hydrate carries provision_timeout_ms onto node.data.provisionTimeoutMs", () => {
useCanvasStore.getState().hydrate([
makeWS({
id: "ws-slow",
name: "Slow",
status: "provisioning",
runtime: "future-runtime",
provision_timeout_ms: 600_000,
}),
]);
const node = useCanvasStore
.getState()
.nodes.find((n) => n.id === "ws-slow");
expect(node?.data.provisionTimeoutMs).toBe(600_000);
});
it("absent provision_timeout_ms hydrates to null (falls through to runtime profile)", () => {
useCanvasStore.getState().hydrate([
makeWS({ id: "ws-default", name: "Default", status: "provisioning", runtime: "hermes" }),
]);
const node = useCanvasStore
.getState()
.nodes.find((n) => n.id === "ws-default");
expect(node?.data.provisionTimeoutMs).toBeNull();
// And the resolver still returns hermes' profile value when
// no override is supplied — proves the fall-through stays intact.
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: node?.data.provisionTimeoutMs ?? undefined,
}),
).toBe(RUNTIME_PROFILES.hermes.provisionTimeoutMs);
});
it("server override wins over runtime profile via the resolver path the component uses", () => {
// Mirrors ProvisioningTimeout.tsx:144 where node.provisionTimeoutMs
// is passed as overrides — verifies the resolver respects it
// even when the runtime has its own profile entry.
const override = 30_000;
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: override,
}),
).toBe(override);
// Sanity — the runtime profile would have been much larger.
expect(RUNTIME_PROFILES.hermes.provisionTimeoutMs).toBeGreaterThan(
override,
);
});
});
});
});

View File

@@ -478,6 +478,9 @@ export function buildNodesAndEdges(
needsRestart: false,
budgetLimit: ws.budget_limit ?? null,
budgetUsed: ws.budget_used ?? null,
// #2054 — server-declared per-workspace provisioning timeout.
// Falls through to the runtime profile when null/absent.
provisionTimeoutMs: ws.provision_timeout_ms ?? null,
},
};
if (hasParent) {

View File

@@ -92,6 +92,12 @@ export interface WorkspaceNodeData extends Record<string, unknown> {
budgetLimit: number | null;
/** Cumulative USD spend. Present when the platform tracks spend (issue #541). */
budgetUsed?: number | null;
/** Per-workspace provisioning-timeout override in milliseconds (#2054).
* Sourced server-side from the workspace's template manifest at provision
* time. null/absent = fall through to runtime profile + default in
* @/lib/runtimeProfiles. Lets a slow runtime declare its cold-boot
* expectation without a canvas release. */
provisionTimeoutMs?: number | null;
}
export type PanelTab = "details" | "skills" | "chat" | "terminal" | "config" | "schedule" | "channels" | "files" | "memory" | "traces" | "events" | "activity" | "audit";

View File

@@ -122,6 +122,13 @@ export interface WorkspaceData {
budget_limit: number | null;
/** Cumulative USD spend for this workspace. Present when the platform tracks spend. */
budget_used?: number | null;
/** Server-declared provisioning-timeout override in milliseconds (#2054).
* Sourced from the workspace's template manifest at provision time;
* lets a slow runtime declare its cold-boot expectation without a
* canvas release. Falls through to the per-runtime profile in
* `@/lib/runtimeProfiles` when absent (the default behavior for any
* template that hasn't yet declared the field). */
provision_timeout_ms?: number | null;
}
let socket: ReconnectingSocket | null = null;