fix(canvas): runtime-aware provisioning-timeout threshold
Hermes workspaces cold-boot in 8-13 min (ripgrep + ffmpeg + node22 +
hermes-agent source build + Playwright + Chromium ~300MB). The canvas's
hardcoded 2-min "Provisioning Timeout" warning fired mid-install and
told users their workspace was "stuck". Users hit Retry, cancelling
healthy workspaces and triggering fresh cold boots.
User-facing symptom (reported 2026-04-24 18:35Z): hermes workspace showed
"has been provisioning for 3m 15s — it may have encountered an issue"
with Retry + Cancel buttons while the EC2 instance was still installing
node_modules.
Fix:
- Keep DEFAULT_PROVISION_TIMEOUT_MS = 120_000 (2min) — correct for fast
docker runtimes (claude-code, langgraph, crewai) where cold boot is
30-90s.
- Add RUNTIME_TIMEOUT_OVERRIDES_MS = { hermes: 720_000 } (12min).
Aligns with tests/e2e/test_staging_full_saas.sh's
PROVISION_TIMEOUT_SECS=900 (15min) so UI warns shortly before the
backend itself gives up.
- New timeoutForRuntime() resolves the base threshold; the check-timeouts
  interval looks it up per node, so a mixed batch (1 hermes + 2 langgraph)
  uses the right threshold for each (see the sketch after this list).
- timeoutMs prop is now optional. Undefined → per-runtime lookup; a
number → forces a single threshold for every workspace (tests use this
for deterministic behavior).
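Resolution sketch (illustrative only; effectiveTimeoutMs is the existing
concurrency-scaling helper in this file, called with (base, count) in the
diff below; thresholdFor is a hypothetical name for the inlined interval
logic, not code in this commit):

  // Hypothetical helper mirroring the inlined check-timeouts logic.
  function thresholdFor(
    propTimeoutMs: number | undefined,
    runtime: string | undefined,
    concurrent: number,
  ): number {
    // Explicit prop wins (tests); otherwise runtime-aware lookup.
    const base = propTimeoutMs ?? timeoutForRuntime(runtime);
    // Existing helper scales the base with the concurrent count.
    return effectiveTimeoutMs(base, concurrent);
  }

  // Mixed batch of 3, no prop override:
  thresholdFor(undefined, "hermes", 3);    // scaled from 720_000 (12 min)
  thresholdFor(undefined, "langgraph", 3); // scaled from 120_000 (2 min)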
Tests: 4 new cases pinning the runtime-aware resolution, including a
guard that catches future regressions that would weaken hermes's budget.
Existing tests unchanged (they import DEFAULT_PROVISION_TIMEOUT_MS,
which is still 120_000).
13/13 pass.
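Prop modes at a glance (a minimal sketch, assuming the suite's existing
vitest + Testing Library setup; the 1-second value is arbitrary):

  import { render } from "@testing-library/react";
  import { ProvisioningTimeout } from "../ProvisioningTimeout";

  // Production mount: no prop, per-runtime lookup (hermes 12 min, rest 2 min).
  render(<ProvisioningTimeout />);
  // Test mount: one forced threshold for every workspace, runtime ignored.
  render(<ProvisioningTimeout timeoutMs={1_000} />);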
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
@@ -6,11 +6,39 @@ import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
 import { ConsoleModal } from "./ConsoleModal";
 
-/** Base provisioning timeout in milliseconds (2 minutes). Used as the
- * floor; the effective threshold scales with the number of workspaces
- * concurrently provisioning (see effectiveTimeoutMs below). */
+/** Base provisioning timeout in milliseconds (2 minutes). Floor for fast
+ * runtimes (claude-code, langgraph, crewai) on Docker where cold boot
+ * is 30-90s. Slow runtimes override via RUNTIME_TIMEOUT_OVERRIDES_MS.
+ * The effective threshold also scales with concurrent-provisioning
+ * count (see effectiveTimeoutMs below). */
 export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
 
+/** Per-runtime timeout floors for cold-boot sequences that legitimately
+ * exceed the 2-minute default. A too-low threshold creates false-alarm
+ * banners telling users "your workspace is stuck" while it's actually
+ * mid-install — confusing, and it makes users retry workspaces that
+ * would have come online on their own.
+ *
+ * Hermes at 12min: installs ripgrep + ffmpeg + node22 + builds
+ * hermes-agent from source + Playwright + Chromium (~300MB). Measured
+ * boots on staging EC2 routinely land at 8-13 min. Aligns with the
+ * SaaS E2E PROVISION_TIMEOUT_SECS=900 (15 min) so the UI warning lands
+ * shortly before the backend itself gives up.
+ *
+ * Add entries here as new runtimes surface false-alarm complaints.
+ * Runtimes absent from the map get DEFAULT_PROVISION_TIMEOUT_MS. */
+export const RUNTIME_TIMEOUT_OVERRIDES_MS: Record<string, number> = {
+  hermes: 720_000, // 12 min — see comment above
+};
+
+/** Resolve the base timeout for a workspace given its runtime. */
+export function timeoutForRuntime(runtime: string | undefined): number {
+  if (runtime && runtime in RUNTIME_TIMEOUT_OVERRIDES_MS) {
+    return RUNTIME_TIMEOUT_OVERRIDES_MS[runtime];
+  }
+  return DEFAULT_PROVISION_TIMEOUT_MS;
+}
+
 /** The server provisions up to `PROVISION_CONCURRENCY` containers at
  * once and paces the rest in a queue (`workspaceCreatePacingMs` =
  * 2s). Mirrors the Go constants — if those change, bump these. */
@@ -43,8 +71,12 @@ interface TimeoutEntry {
  * time per node.
  */
 export function ProvisioningTimeout({
-  timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
+  timeoutMs,
 }: {
+  // If undefined (the default when mounted without a prop), each workspace's
+  // threshold is resolved from its runtime via timeoutForRuntime().
+  // Pass an explicit number to force a single threshold for every workspace
+  // (used by tests that want deterministic behavior regardless of runtime).
   timeoutMs?: number;
 }) {
   const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
@@ -57,19 +89,28 @@ export function ProvisioningTimeout({
   const [dismissed, setDismissed] = useState<Set<string>>(new Set());
 
   // Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
-  // (filter+map creates new array reference on every store update)
+  // (filter+map creates new array reference on every store update).
+  // Runtime included so the timeout threshold can be resolved per-node
+  // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
+  // runtimes — a single threshold would false-alarm on one or the other).
+  // Separator: `|` between fields, `,` between nodes. Names may contain
+  // anything the user typed; strip `|` and `,` so serialization round-trips.
   const provisioningNodes = useCanvasStore((s) => {
     const result = s.nodes
       .filter((n) => n.data.status === "provisioning")
-      .map((n) => `${n.id}:${n.data.name}`);
+      .map((n) => {
+        const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
+        const runtime = n.data.runtime ?? "";
+        return `${n.id}|${safeName}|${runtime}`;
+      });
     return result.join(",");
   });
   const parsedProvisioningNodes = useMemo(
     () =>
       provisioningNodes
         ? provisioningNodes.split(",").map((entry) => {
-            const [id, name] = entry.split(":");
-            return { id, name };
+            const [id, name, runtime] = entry.split("|");
+            return { id, name, runtime };
           })
         : [],
     [provisioningNodes],
@@ -113,14 +154,20 @@ export function ProvisioningTimeout({
     const interval = setInterval(() => {
       const now = Date.now();
       const newTimedOut: TimeoutEntry[] = [];
-      const effective = effectiveTimeoutMs(
-        timeoutMs,
-        parsedProvisioningNodes.length,
-      );
 
+      // Per-node timeout: each workspace has its own base (runtime-aware)
+      // scaled by the total concurrent-provisioning count. A hermes
+      // workspace in a batch alongside two langgraph workspaces gets
+      // hermes's 12-min base, not langgraph's 2-min base.
       for (const node of parsedProvisioningNodes) {
         const startedAt = tracking.get(node.id);
-        if (startedAt && now - startedAt >= effective) {
+        if (!startedAt) continue;
+        const base = timeoutMs ?? timeoutForRuntime(node.runtime);
+        const effective = effectiveTimeoutMs(
+          base,
+          parsedProvisioningNodes.length,
+        );
+        if (now - startedAt >= effective) {
           newTimedOut.push({
             workspaceId: node.id,
             workspaceName: node.name,
@@ -7,7 +7,11 @@ global.fetch = vi.fn(() =>
 
 import { useCanvasStore } from "../../store/canvas";
 import type { WorkspaceData } from "../../store/socket";
-import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
+import {
+  DEFAULT_PROVISION_TIMEOUT_MS,
+  RUNTIME_TIMEOUT_OVERRIDES_MS,
+  timeoutForRuntime,
+} from "../ProvisioningTimeout";
 
 // Helper to build a WorkspaceData object
 function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@@ -184,4 +188,45 @@ describe("ProvisioningTimeout", () => {
       .nodes.filter((n) => n.data.status === "provisioning");
     expect(stillProvisioning).toHaveLength(2);
   });
+
+  // ── Runtime-aware timeout regression tests (2026-04-24 outage) ────────────
+  // Prior to this, a hermes workspace consistently false-alarmed at 2 min
+  // into its 8-13 min cold boot, pushing users to retry something that
+  // would have come online on its own. The runtime-aware override keeps
+  // the 2-min floor for fast docker runtimes while giving hermes its
+  // honest 12-min budget.
+
+  describe("timeoutForRuntime", () => {
+    it("returns the 2-min default for unknown/missing runtimes", () => {
+      expect(timeoutForRuntime(undefined)).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("some-future-runtime")).toBe(
+        DEFAULT_PROVISION_TIMEOUT_MS,
+      );
+    });
+
+    it("returns the docker-fast 2-min default for known-fast runtimes", () => {
+      // These aren't in the override map so they get the default.
+      // If someone ever adds one of them to RUNTIME_TIMEOUT_OVERRIDES_MS,
+      // this test catches the accidental regression.
+      expect(timeoutForRuntime("claude-code")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("langgraph")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+      expect(timeoutForRuntime("crewai")).toBe(DEFAULT_PROVISION_TIMEOUT_MS);
+    });
+
+    it("returns 12 min for hermes — covers cold-boot install tail", () => {
+      expect(timeoutForRuntime("hermes")).toBe(720_000);
+      expect(timeoutForRuntime("hermes")).toBe(
+        RUNTIME_TIMEOUT_OVERRIDES_MS.hermes,
+      );
+    });
+
+    it("hermes override is materially longer than the default", () => {
+      // Guard against future refactors that accidentally weaken the
+      // override (e.g. typo lowering hermes to 72_000 = 72s).
+      expect(RUNTIME_TIMEOUT_OVERRIDES_MS.hermes).toBeGreaterThanOrEqual(
+        DEFAULT_PROVISION_TIMEOUT_MS * 5,
+      );
+    });
+  });
 });
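Worked example of the store-selector round-trip in the diff above (the
node values are made up):

  // A node named `build|video,prod` with runtime "hermes":
  const entry = "n1|" + "build|video,prod".replace(/[|,]/g, " ") + "|hermes";
  // entry === "n1|build video prod|hermes"
  const [id, name, runtime] = entry.split("|");
  // id === "n1", name === "build video prod", runtime === "hermes";
  // stripping `|` and `,` from names keeps both split steps unambiguous.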