Merge pull request #2052 from Molecule-AI/fix/canvas-provisioning-timeout-runtime-aware
fix(canvas): runtime-aware provisioning-timeout threshold (hermes 12min vs default 2min)
This commit is contained in:
commit
04e60e7303
@ -6,10 +6,16 @@ import { api } from "@/lib/api";
|
||||
import { showToast } from "./Toaster";
|
||||
import { ConsoleModal } from "./ConsoleModal";
|
||||
|
||||
/** Base provisioning timeout in milliseconds (2 minutes). Used as the
|
||||
* floor; the effective threshold scales with the number of workspaces
|
||||
* concurrently provisioning (see effectiveTimeoutMs below). */
|
||||
export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
|
||||
import {
|
||||
DEFAULT_RUNTIME_PROFILE,
|
||||
provisionTimeoutForRuntime,
|
||||
} from "@/lib/runtimeProfiles";
|
||||
|
||||
/** Re-export for backward compatibility with tests and other importers
|
||||
* that previously imported DEFAULT_PROVISION_TIMEOUT_MS from this file.
|
||||
* New code should read via getRuntimeProfile() from @/lib/runtimeProfiles. */
|
||||
export const DEFAULT_PROVISION_TIMEOUT_MS =
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs;
|
||||
|
||||
/** The server provisions up to `PROVISION_CONCURRENCY` containers at
|
||||
* once and paces the rest in a queue (`workspaceCreatePacingMs` =
|
||||
@ -43,8 +49,12 @@ interface TimeoutEntry {
|
||||
* time per node.
|
||||
*/
|
||||
export function ProvisioningTimeout({
|
||||
timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
|
||||
timeoutMs,
|
||||
}: {
|
||||
// If undefined (the default when mounted without a prop), each workspace's
|
||||
// threshold is resolved from its runtime via provisionTimeoutForRuntime().
|
||||
// Pass an explicit number to force a single threshold for every workspace
|
||||
// (used by tests that want deterministic behavior regardless of runtime).
|
||||
timeoutMs?: number;
|
||||
}) {
|
||||
const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
|
||||
@ -57,19 +67,28 @@ export function ProvisioningTimeout({
|
||||
const [dismissed, setDismissed] = useState<Set<string>>(new Set());
|
||||
|
||||
// Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
|
||||
// (filter+map creates new array reference on every store update)
|
||||
// (filter+map creates new array reference on every store update).
|
||||
// Runtime included so the timeout threshold can be resolved per-node
|
||||
// (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
|
||||
// runtimes — a single threshold would false-alarm on one or the other).
|
||||
// Separator: `|` between fields, `,` between nodes. Names may contain
|
||||
// anything the user typed; strip `|` and `,` so serialization round-trips.
|
||||
const provisioningNodes = useCanvasStore((s) => {
|
||||
const result = s.nodes
|
||||
.filter((n) => n.data.status === "provisioning")
|
||||
.map((n) => `${n.id}:${n.data.name}`);
|
||||
.map((n) => {
|
||||
const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
|
||||
const runtime = n.data.runtime ?? "";
|
||||
return `${n.id}|${safeName}|${runtime}`;
|
||||
});
|
||||
return result.join(",");
|
||||
});
|
||||
const parsedProvisioningNodes = useMemo(
|
||||
() =>
|
||||
provisioningNodes
|
||||
? provisioningNodes.split(",").map((entry) => {
|
||||
const [id, name] = entry.split(":");
|
||||
return { id, name };
|
||||
const [id, name, runtime] = entry.split("|");
|
||||
return { id, name, runtime };
|
||||
})
|
||||
: [],
|
||||
[provisioningNodes],
|
||||
@ -113,14 +132,21 @@ export function ProvisioningTimeout({
|
||||
const interval = setInterval(() => {
|
||||
const now = Date.now();
|
||||
const newTimedOut: TimeoutEntry[] = [];
|
||||
const effective = effectiveTimeoutMs(
|
||||
timeoutMs,
|
||||
parsedProvisioningNodes.length,
|
||||
);
|
||||
|
||||
// Per-node timeout: each workspace resolves its own base via
|
||||
// @/lib/runtimeProfiles (explicit timeoutMs prop → runtime profile →
|
||||
// default), then scales by concurrent-provisioning count. A
|
||||
// hermes workspace in a batch alongside two langgraph workspaces
|
||||
// gets hermes's 12-min base, not langgraph's 2-min base.
|
||||
for (const node of parsedProvisioningNodes) {
|
||||
const startedAt = tracking.get(node.id);
|
||||
if (startedAt && now - startedAt >= effective) {
|
||||
if (!startedAt) continue;
|
||||
const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
|
||||
const effective = effectiveTimeoutMs(
|
||||
base,
|
||||
parsedProvisioningNodes.length,
|
||||
);
|
||||
if (now - startedAt >= effective) {
|
||||
newTimedOut.push({
|
||||
workspaceId: node.id,
|
||||
workspaceName: node.name,
|
||||
|
||||
@ -8,6 +8,12 @@ global.fetch = vi.fn(() =>
|
||||
import { useCanvasStore } from "../../store/canvas";
|
||||
import type { WorkspaceData } from "../../store/socket";
|
||||
import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
|
||||
import {
|
||||
DEFAULT_RUNTIME_PROFILE,
|
||||
RUNTIME_PROFILES,
|
||||
getRuntimeProfile,
|
||||
provisionTimeoutForRuntime,
|
||||
} from "@/lib/runtimeProfiles";
|
||||
|
||||
// Helper to build a WorkspaceData object
|
||||
function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
|
||||
@ -184,4 +190,102 @@ describe("ProvisioningTimeout", () => {
|
||||
.nodes.filter((n) => n.data.status === "provisioning");
|
||||
expect(stillProvisioning).toHaveLength(2);
|
||||
});
|
||||
|
||||
// ── Runtime-aware timeout regression tests (2026-04-24 outage) ────────────
|
||||
// Prior to this, a hermes workspace consistently false-alarmed at 2 min
|
||||
// into its 8-13 min cold boot, pushing users to retry something that
|
||||
// would have come online on its own. The runtime-aware override keeps
|
||||
// the 2-min floor for fast docker runtimes while giving hermes its
|
||||
// honest 12-min budget.
|
||||
|
||||
describe("runtime profile resolution (@/lib/runtimeProfiles)", () => {
|
||||
describe("provisionTimeoutForRuntime", () => {
|
||||
it("returns the default for unknown/missing runtimes", () => {
|
||||
expect(provisionTimeoutForRuntime(undefined)).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("some-future-runtime")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
});
|
||||
|
||||
it("returns default for known-fast runtimes (not in profile map)", () => {
|
||||
// If someone ever adds one of these to RUNTIME_PROFILES with a
|
||||
// slower value, this test catches the unintended regression.
|
||||
expect(provisionTimeoutForRuntime("claude-code")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("langgraph")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("crewai")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
});
|
||||
|
||||
it("returns hermes override when runtime = hermes", () => {
|
||||
expect(provisionTimeoutForRuntime("hermes")).toBe(
|
||||
RUNTIME_PROFILES.hermes?.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("hermes")).toBeGreaterThanOrEqual(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs * 5,
|
||||
);
|
||||
});
|
||||
|
||||
it("server-side workspace override wins over runtime profile", () => {
|
||||
// The resolution order is: overrides → profile → default.
|
||||
// An operator-tunable per-workspace number on the backend
|
||||
// (e.g. via a template manifest field) should beat the canvas
|
||||
// runtime map.
|
||||
expect(
|
||||
provisionTimeoutForRuntime("hermes", {
|
||||
provisionTimeoutMs: 60_000,
|
||||
}),
|
||||
).toBe(60_000);
|
||||
expect(
|
||||
provisionTimeoutForRuntime("some-unknown", {
|
||||
provisionTimeoutMs: 300_000,
|
||||
}),
|
||||
).toBe(300_000);
|
||||
});
|
||||
});
|
||||
|
||||
describe("getRuntimeProfile", () => {
|
||||
it("returns a structural profile with required fields", () => {
|
||||
const profile = getRuntimeProfile("hermes");
|
||||
expect(profile.provisionTimeoutMs).toBeTypeOf("number");
|
||||
expect(profile.provisionTimeoutMs).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it("default profile is a valid superset of every override", () => {
|
||||
// Every entry in RUNTIME_PROFILES must provide fields the
|
||||
// default does — otherwise consumers could get undefined where
|
||||
// they expected a number. This test enforces that contract so
|
||||
// future entries can't accidentally drop fields.
|
||||
for (const [runtime, profile] of Object.entries(RUNTIME_PROFILES)) {
|
||||
const resolved = getRuntimeProfile(runtime);
|
||||
expect(
|
||||
resolved.provisionTimeoutMs,
|
||||
`runtime=${runtime} must resolve to a number`,
|
||||
).toBeTypeOf("number");
|
||||
expect(resolved.provisionTimeoutMs).toBeGreaterThan(0);
|
||||
// Profile's explicit value should be used iff present.
|
||||
if (profile.provisionTimeoutMs !== undefined) {
|
||||
expect(resolved.provisionTimeoutMs).toBe(profile.provisionTimeoutMs);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("DEFAULT_PROVISION_TIMEOUT_MS backward-compat export", () => {
|
||||
it("still exports the same default for legacy importers", () => {
|
||||
expect(DEFAULT_PROVISION_TIMEOUT_MS).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
120
canvas/src/lib/runtimeProfiles.ts
Normal file
120
canvas/src/lib/runtimeProfiles.ts
Normal file
@ -0,0 +1,120 @@
|
||||
/**
 * Runtime profiles — per-runtime UX metadata.
 *
 * Scaling target: hundreds of runtimes (plugin-architecture-v2 roadmap).
 * This module is the single source of truth for runtime-specific UI knobs
 * on the canvas side. Each runtime can declare:
 *
 *   - provisionTimeoutMs: when to show the "taking longer than expected"
 *     banner. Fast docker runtimes = 2min; slow source-build runtimes = 12min.
 *   - (future) label, icon, color, helpUrl, capabilities — add as needed.
 *
 * Resolution order (most specific wins):
 *
 *   1. Server-provided override on the workspace data (e.g.
 *      `workspace.data.provisionTimeoutMs` set from a template manifest).
 *      Lets operators tune without a canvas release once server-side
 *      declarative config lands.
 *   2. Per-runtime entry in RUNTIME_PROFILES.
 *   3. DEFAULT_RUNTIME_PROFILE.
 *
 * A value at any level is only used when it is a finite number greater
 * than zero; anything else (0, negative, NaN, Infinity, non-number) is
 * ignored and resolution falls through to the next level.
 *
 * Adding a new runtime:
 *   - If it's fast (≤ 2min cold boot): do nothing, the default catches it.
 *   - If it's slow: add one entry to RUNTIME_PROFILES below.
 *   - Long-term: move runtime profiles server-side so this file can shrink.
 *
 * Architectural note: this deliberately lives under /lib, NOT
 * /components/ProvisioningTimeout. Other components (e.g. a
 * "create workspace" dialog that needs to know the runtime's expected
 * cold-boot time) should import from here too — avoids duplicating the
 * runtime-name knowledge across the codebase.
 */

/**
 * Structural shape of a runtime profile. Add fields as new UX knobs
 * become runtime-specific. Every field should be optional so new runtimes
 * can partially fill the profile without breaking older code that reads
 * only some fields.
 */
export interface RuntimeProfile {
  /** Milliseconds before the canvas shows the "taking too long" banner.
   *  Base value — the ProvisioningTimeout component still scales this by
   *  concurrent-provisioning count. */
  provisionTimeoutMs?: number;
  // Future extensions (kept commented until used):
  // label?: string;
  // icon?: string;
  // color?: string;
  // helpUrl?: string;
}

/** The floor every runtime inherits unless it overrides. Calibrated for
 *  docker-local fast runtimes (claude-code, langgraph, crewai) where cold
 *  boot is 30-90s. */
export const DEFAULT_RUNTIME_PROFILE: Required<
  Pick<RuntimeProfile, "provisionTimeoutMs">
> = {
  provisionTimeoutMs: 120_000, // 2 min
};

/**
 * Named per-runtime overrides. Keep this map small and explicit —
 * each entry is a deliberate statement that this runtime's cold-boot
 * behavior differs materially from the default.
 *
 * Each override must also ship with a comment explaining WHY the default
 * is wrong for this runtime. Unexplained numbers rot.
 */
export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
  hermes: {
    // 12 min. Installs ripgrep + ffmpeg + node22 + builds hermes-agent
    // from source + Playwright + Chromium (~300MB download). Measured
    // cold boots on staging EC2 routinely land at 8-13 min. Aligns
    // with SaaS E2E's PROVISION_TIMEOUT_SECS=900 (15 min) so the UI
    // warning lands shortly before the backend itself gives up.
    provisionTimeoutMs: 720_000,
  },
};

/**
 * Data fields the canvas can consult for per-workspace overrides. These
 * let the backend (via workspace data on the socket payload) override
 * profile values without a canvas release.
 *
 * Intentionally loose typing — if a field isn't present on the node, we
 * fall through to the runtime profile.
 */
export interface WorkspaceRuntimeOverrides {
  provisionTimeoutMs?: number;
}

/**
 * Returns `value` when it is usable as a timeout (a finite number > 0),
 * otherwise `undefined` so the caller's `??` chain moves on.
 *
 * Takes `unknown` because overrides originate from a loosely typed
 * payload; the static `number` type is not trusted at runtime.
 */
function usableTimeoutMs(value: unknown): number | undefined {
  return typeof value === "number" && Number.isFinite(value) && value > 0
    ? value
    : undefined;
}

/**
 * Resolve a runtime profile for a given runtime name, optionally merging
 * server-provided per-workspace overrides on top.
 *
 * Resolution (most-specific wins, unusable values are skipped):
 *   overrides.provisionTimeoutMs
 *     → RUNTIME_PROFILES[runtime].provisionTimeoutMs
 *     → DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs
 *
 * @param runtime   Runtime name; `undefined`, `""`, or a name without an
 *                  entry in RUNTIME_PROFILES resolves to the default.
 * @param overrides Optional per-workspace values that take precedence
 *                  over the runtime's profile entry.
 * @returns A profile whose `provisionTimeoutMs` is always a number.
 */
export function getRuntimeProfile(
  runtime: string | undefined,
  overrides?: WorkspaceRuntimeOverrides,
): Required<Pick<RuntimeProfile, "provisionTimeoutMs">> {
  // Own-property check: a plain-object lookup would otherwise hand back
  // inherited members for names such as "constructor" or "__proto__".
  const profile =
    runtime && Object.prototype.hasOwnProperty.call(RUNTIME_PROFILES, runtime)
      ? RUNTIME_PROFILES[runtime]
      : undefined;

  return {
    provisionTimeoutMs:
      usableTimeoutMs(overrides?.provisionTimeoutMs) ??
      usableTimeoutMs(profile?.provisionTimeoutMs) ??
      DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
  };
}

/** Convenience: just the provisionTimeoutMs. Equivalent to
 *  `getRuntimeProfile(runtime, overrides).provisionTimeoutMs`. */
export function provisionTimeoutForRuntime(
  runtime: string | undefined,
  overrides?: WorkspaceRuntimeOverrides,
): number {
  return getRuntimeProfile(runtime, overrides).provisionTimeoutMs;
}
|
||||
Loading…
Reference in New Issue
Block a user