feat(canvas): per-workspace provision_timeout_ms override (#2054)

Phase 1 of moving runtime UX knobs server-side. Builds the canvas
foundation: a workspace can carry its own provision_timeout_ms
(sourced server-side from a template manifest in a follow-up PR),
and ProvisioningTimeout's resolver respects it per-node.

Previously the resolver had a Props-level timeoutMs that applied to
ALL nodes — fine for tests but wrong for production, where one batch
could mix runtimes (hermes 12-min cold boot alongside docker 2-min).
The runtime profile fallback already handles per-runtime defaults;
this PR adds the per-WORKSPACE override layer above that.

Resolution priority (most specific wins):
  1. node.provisionTimeoutMs — server-declared per-workspace
     override (this PR's new field)
  2. timeoutMs prop — single-threshold test override
  3. runtime profile in @/lib/runtimeProfiles
  4. DEFAULT_RUNTIME_PROFILE

Changes:
- WorkspaceData (socket): add optional provision_timeout_ms
- WorkspaceNodeData: add optional provisionTimeoutMs
- canvas-topology hydrate: thread the field through to node.data
- ProvisioningTimeout: extend the serialized-string node iteration
  to carry provisionTimeoutMs (4-field positional split); pass as
  the second arg to provisionTimeoutForRuntime
- 3 new tests in ProvisioningTimeout.test.tsx covering hydrate
  threading, null fall-through, and resolver priority

Phase 2 (separate PR, blocked on workspace-server template-config
loader): workspace-server reads provision_timeout_seconds from
template config.yaml at provision time, includes
provision_timeout_ms in the workspace API/socket response. Phase 3
(template-repo PR): template-hermes config.yaml declares
provision_timeout_seconds: 720; canvas RUNTIME_PROFILES.hermes
becomes redundant and can be removed.

19/19 tests pass (3 new + 16 existing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
rabbitblood 2026-04-26 06:02:56 -07:00
parent dff14c010e
commit 1a273f21f5
5 changed files with 94 additions and 4 deletions

View File

@@ -71,15 +71,19 @@ export function ProvisioningTimeout({
// Runtime included so the timeout threshold can be resolved per-node
// (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
// runtimes — a single threshold would false-alarm on one or the other).
// provisionTimeoutMs added by #2054 — server-declared per-workspace
// override that wins over the runtime profile when present.
// Separator: `|` between fields, `,` between nodes. Names may contain
// anything the user typed; strip `|` and `,` so serialization round-trips.
// Empty-string sentinels for missing values so split/index stays positional.
const provisioningNodes = useCanvasStore((s) => {
const result = s.nodes
.filter((n) => n.data.status === "provisioning")
.map((n) => {
const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
const runtime = n.data.runtime ?? "";
return `${n.id}|${safeName}|${runtime}`;
const provisionTimeoutMs = n.data.provisionTimeoutMs ?? "";
return `${n.id}|${safeName}|${runtime}|${provisionTimeoutMs}`;
});
return result.join(",");
});
@@ -87,8 +91,14 @@ export function ProvisioningTimeout({
() =>
provisioningNodes
? provisioningNodes.split(",").map((entry) => {
const [id, name, runtime] = entry.split("|");
return { id, name, runtime };
const [id, name, runtime, provisionTimeoutMs] = entry.split("|");
const ptms = provisionTimeoutMs ? Number(provisionTimeoutMs) : undefined;
return {
id,
name,
runtime,
provisionTimeoutMs: Number.isFinite(ptms) ? ptms : undefined,
};
})
: [],
[provisioningNodes],
@@ -138,10 +148,19 @@ export function ProvisioningTimeout({
// default), then scales by concurrent-provisioning count. A
// hermes workspace in a batch alongside two langgraph workspaces
// gets hermes's 12-min base, not langgraph's 2-min base.
//
// Resolution priority (most specific wins):
// 1. node.provisionTimeoutMs — server-declared per-workspace
// override (#2054, sourced from template manifest)
// 2. timeoutMs prop — single-threshold test override
// 3. runtime profile in @/lib/runtimeProfiles
// 4. DEFAULT_RUNTIME_PROFILE
for (const node of parsedProvisioningNodes) {
const startedAt = tracking.get(node.id);
if (!startedAt) continue;
const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
const base = provisionTimeoutForRuntime(node.runtime, {
provisionTimeoutMs: node.provisionTimeoutMs ?? timeoutMs,
});
const effective = effectiveTimeoutMs(
base,
parsedProvisioningNodes.length,

View File

@@ -287,5 +287,60 @@ describe("ProvisioningTimeout", () => {
);
});
});
// #2054 — per-workspace server override threading from socket
// payload through node-data into ProvisioningTimeout's resolver.
// Doesn't render the component; verifies the data path lands the
// value where ProvisioningTimeout reads it from.
describe("server-side per-workspace override (#2054)", () => {
it("hydrate carries provision_timeout_ms onto node.data.provisionTimeoutMs", () => {
useCanvasStore.getState().hydrate([
makeWS({
id: "ws-slow",
name: "Slow",
status: "provisioning",
runtime: "future-runtime",
provision_timeout_ms: 600_000,
}),
]);
const node = useCanvasStore
.getState()
.nodes.find((n) => n.id === "ws-slow");
expect(node?.data.provisionTimeoutMs).toBe(600_000);
});
it("absent provision_timeout_ms hydrates to null (falls through to runtime profile)", () => {
useCanvasStore.getState().hydrate([
makeWS({ id: "ws-default", name: "Default", status: "provisioning", runtime: "hermes" }),
]);
const node = useCanvasStore
.getState()
.nodes.find((n) => n.id === "ws-default");
expect(node?.data.provisionTimeoutMs).toBeNull();
// And the resolver still returns hermes' profile value when
// no override is supplied — proves the fall-through stays intact.
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: node?.data.provisionTimeoutMs ?? undefined,
}),
).toBe(RUNTIME_PROFILES.hermes.provisionTimeoutMs);
});
it("server override wins over runtime profile via the resolver path the component uses", () => {
// Mirrors ProvisioningTimeout.tsx:144 where node.provisionTimeoutMs
// is passed as overrides — verifies the resolver respects it
// even when the runtime has its own profile entry.
const override = 30_000;
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: override,
}),
).toBe(override);
// Sanity — the runtime profile would have been much larger.
expect(RUNTIME_PROFILES.hermes.provisionTimeoutMs).toBeGreaterThan(
override,
);
});
});
});
});

View File

@@ -478,6 +478,9 @@ export function buildNodesAndEdges(
needsRestart: false,
budgetLimit: ws.budget_limit ?? null,
budgetUsed: ws.budget_used ?? null,
// #2054 — server-declared per-workspace provisioning timeout.
// Falls through to the runtime profile when null/absent.
provisionTimeoutMs: ws.provision_timeout_ms ?? null,
},
};
if (hasParent) {

View File

@@ -92,6 +92,12 @@ export interface WorkspaceNodeData extends Record<string, unknown> {
budgetLimit: number | null;
/** Cumulative USD spend. Present when the platform tracks spend (issue #541). */
budgetUsed?: number | null;
/** Per-workspace provisioning-timeout override in milliseconds (#2054).
* Sourced server-side from the workspace's template manifest at provision
* time. null/absent = fall through to runtime profile + default in
* @/lib/runtimeProfiles. Lets a slow runtime declare its cold-boot
* expectation without a canvas release. */
provisionTimeoutMs?: number | null;
}
export type PanelTab = "details" | "skills" | "chat" | "terminal" | "config" | "schedule" | "channels" | "files" | "memory" | "traces" | "events" | "activity" | "audit";

View File

@@ -122,6 +122,13 @@ export interface WorkspaceData {
budget_limit: number | null;
/** Cumulative USD spend for this workspace. Present when the platform tracks spend. */
budget_used?: number | null;
/** Server-declared provisioning-timeout override in milliseconds (#2054).
* Sourced from the workspace's template manifest at provision time;
* lets a slow runtime declare its cold-boot expectation without a
* canvas release. Falls through to the per-runtime profile in
* `@/lib/runtimeProfiles` when absent (the default behavior for any
* template that hasn't yet declared the field). */
provision_timeout_ms?: number | null;
}
let socket: ReconnectingSocket | null = null;