diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index f620537b..2021d631 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -260,7 +260,13 @@ func main() { // and the state is incoherent (e.g. user sees "Retry" after 15min but // backend still thinks provisioning is in progress). go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) { - registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval) + // Pass the handler's per-runtime template-manifest lookup so the + // sweeper honours `runtime_config.provision_timeout_seconds` + // declared in any template's config.yaml — the same value the + // canvas already reads via addProvisionTimeoutMs. Without this + // the sweeper killed claude-code at the 10-min hardcoded floor + // regardless of the manifest. See registry.RuntimeTimeoutLookup. + registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval, wh.ProvisionTimeoutSecondsForRuntime) }) // Cron Scheduler — fires A2A messages to workspaces on user-defined schedules diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 78181e61..32057f22 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -498,6 +498,22 @@ func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runt } } +// ProvisionTimeoutSecondsForRuntime returns the per-runtime provision +// timeout in seconds when a template's config.yaml declared +// `runtime_config.provision_timeout_seconds`, else 0 ("no override — +// caller falls through to its own default"). +// +// Exported so cmd/server/main.go can pass it to +// registry.StartProvisioningTimeoutSweep — same template-manifest value +// the canvas reads via addProvisionTimeoutMs. Without this, the +// sweeper killed claude-code at 10 min while the manifest declared a +// longer window, and a user saw the "Retry" UI before their image +// pull even finished. See registry.RuntimeTimeoutLookup for the +// resolution order. +func (h *WorkspaceHandler) ProvisionTimeoutSecondsForRuntime(runtime string) int { + return h.provisionTimeouts.get(h.configsDir, runtime) +} + // scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map. func scanWorkspaceRow(rows interface { Scan(dest ...interface{}) error diff --git a/workspace-server/internal/registry/provisiontimeout.go b/workspace-server/internal/registry/provisiontimeout.go index 268c929e..1b35798e 100644 --- a/workspace-server/internal/registry/provisiontimeout.go +++ b/workspace-server/internal/registry/provisiontimeout.go @@ -47,18 +47,44 @@ const HermesProvisioningTimeout = 30 * time.Minute // query which hits the primary key / status partial index. const DefaultProvisionSweepInterval = 30 * time.Second -// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors -// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn). -// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL -// runtimes — useful for ops debugging but loses the runtime nuance, so -// operators should prefer the defaults unless they have a specific -// reason. -func provisioningTimeoutFor(runtime string) time.Duration { +// RuntimeTimeoutLookup returns the per-runtime provision timeout in +// seconds when a template's config.yaml declared +// `runtime_config.provision_timeout_seconds`, else zero (= "no override, +// fall through to runtime defaults below"). Same shape as +// runtimeProvisionTimeoutsCache.get in handlers — wired through main.go +// so this package stays template-discovery agnostic. +// +// Why an interface instead of importing the cache directly: registry +// already sits below handlers in the import graph (handlers → registry, +// not the reverse). A function-typed argument keeps that flow. +type RuntimeTimeoutLookup func(runtime string) int + +// provisioningTimeoutFor picks the per-runtime sweep deadline. Resolution +// order: +// +// 1. PROVISION_TIMEOUT_SECONDS env — global override, ops-debug only. +// 2. Template manifest override (lookup) — what the canvas spinner +// also reads via #2054 phase 2. Without this, a template that +// declared `runtime_config.provision_timeout_seconds: 900` would +// still get killed by the sweeper at the 10-min hardcoded floor — +// a real wiring gap that drove every claude-code burst on a cold +// EC2 to false-positive timeout. +// 3. Hermes special-case (CP bootstrap-watcher 25 min + 5 min slack). +// 4. DefaultProvisioningTimeout (10 min) for everything else. +// +// lookup may be nil (during package tests, or before main.go has wired +// it) — falls through to the legacy hermes/default split. +func provisioningTimeoutFor(runtime string, lookup RuntimeTimeoutLookup) time.Duration { if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" { if n, err := strconv.Atoi(v); err == nil && n > 0 { return time.Duration(n) * time.Second } } + if lookup != nil { + if secs := lookup(runtime); secs > 0 { + return time.Duration(secs) * time.Second + } + } if runtime == "hermes" { return HermesProvisioningTimeout } @@ -74,7 +100,7 @@ func provisioningTimeoutFor(runtime string) time.Duration { // The sweep is idempotent: the UPDATE's WHERE clause re-checks both status // and age under the same row lock, so a workspace that raced to `online` or // was restarted while the sweep was scanning will not get flipped. -func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration) { +func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration, lookup RuntimeTimeoutLookup) { if emitter == nil { log.Println("Provision-timeout sweep: emitter is nil — skipping (no one to broadcast to)") return @@ -85,15 +111,15 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout ticker := time.NewTicker(interval) defer ticker.Stop() - log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)", - interval, DefaultProvisioningTimeout, HermesProvisioningTimeout) + log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes / per-runtime manifest override=%v)", + interval, DefaultProvisioningTimeout, HermesProvisioningTimeout, lookup != nil) for { select { case <-ctx.Done(): return case <-ticker.C: - sweepStuckProvisioning(ctx, emitter) + sweepStuckProvisioning(ctx, emitter, lookup) } } } @@ -109,7 +135,7 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout // sweep, leaving an incoherent "marked failed but actually working" // state. See bootstrap_watcher.go's bootstrapTimeoutFn for the // canonical CP-side gating. -func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) { +func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter, lookup RuntimeTimeoutLookup) { // We can't pre-filter by age in SQL because the threshold depends // on the row's runtime. Pull every provisioning row + its runtime // + its age, evaluate per-row in Go. Still cheap — the @@ -141,7 +167,7 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter } for _, c := range ids { - timeout := provisioningTimeoutFor(c.runtime) + timeout := provisioningTimeoutFor(c.runtime, lookup) timeoutSec := int(timeout / time.Second) if c.ageSec < timeoutSec { continue diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go index fccb966f..3d1017f2 100644 --- a/workspace-server/internal/registry/provisiontimeout_test.go +++ b/workspace-server/internal/registry/provisiontimeout_test.go @@ -66,7 +66,7 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) emit := &fakeEmitter{} - sweepStuckProvisioning(context.Background(), emit) + sweepStuckProvisioning(context.Background(), emit, nil) if emit.count() != 1 { t.Fatalf("expected 1 event, got %d", emit.count()) @@ -96,7 +96,7 @@ func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) { WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660})) emit := &fakeEmitter{} - sweepStuckProvisioning(context.Background(), emit) + sweepStuckProvisioning(context.Background(), emit, nil) if emit.count() != 0 { t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count()) @@ -121,7 +121,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) emit := &fakeEmitter{} - sweepStuckProvisioning(context.Background(), emit) + sweepStuckProvisioning(context.Background(), emit, nil) if emit.count() != 1 { t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count()) @@ -151,7 +151,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows — raced emit := &fakeEmitter{} - sweepStuckProvisioning(context.Background(), emit) + sweepStuckProvisioning(context.Background(), emit, nil) if emit.count() != 0 { t.Errorf("expected 0 events on race, got %d", emit.count()) @@ -170,7 +170,7 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) { WillReturnRows(candidateRows()) emit := &fakeEmitter{} - sweepStuckProvisioning(context.Background(), emit) + sweepStuckProvisioning(context.Background(), emit, nil) if emit.count() != 0 { t.Errorf("expected 0 events when nothing stuck, got %d", emit.count()) @@ -201,7 +201,7 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) emit := &fakeEmitter{} - sweepStuckProvisioning(context.Background(), emit) + sweepStuckProvisioning(context.Background(), emit, nil) if emit.count() != 2 { t.Fatalf("expected 2 events, got %d", emit.count()) @@ -222,7 +222,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) { emit := &fakeEmitter{fail: true} // Must not panic. - sweepStuckProvisioning(context.Background(), emit) + sweepStuckProvisioning(context.Background(), emit, nil) } // TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS @@ -231,18 +231,18 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) { func TestProvisioningTimeout_EnvOverride(t *testing.T) { t.Setenv("PROVISION_TIMEOUT_SECONDS", "60") // When env override is set it wins over runtime defaults. - if got := provisioningTimeoutFor(""); got.Seconds() != 60 { + if got := provisioningTimeoutFor("", nil); got.Seconds() != 60 { t.Errorf("override (no runtime): got %v, want 60s", got) } - if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 { + if got := provisioningTimeoutFor("hermes", nil); got.Seconds() != 60 { t.Errorf("override (hermes): got %v, want 60s", got) } t.Setenv("PROVISION_TIMEOUT_SECONDS", "") - if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout { + if got := provisioningTimeoutFor("", nil); got != DefaultProvisioningTimeout { t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout) } t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number") - if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout { + if got := provisioningTimeoutFor("claude-code", nil); got != DefaultProvisioningTimeout { t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout) } } @@ -266,8 +266,69 @@ func TestProvisioningTimeout_RuntimeAware(t *testing.T) { {"unknown-runtime", DefaultProvisioningTimeout}, } for _, c := range cases { - if got := provisioningTimeoutFor(c.runtime); got != c.want { + if got := provisioningTimeoutFor(c.runtime, nil); got != c.want { t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want) } } } + +// TestProvisioningTimeout_ManifestOverride pins the resolution order +// when a template's config.yaml declared +// `runtime_config.provision_timeout_seconds`. Without this gate, the +// sweeper kept the hardcoded 10-min floor regardless of manifest — +// which is the original wiring gap that drove false-positive timeouts +// on cold-pull claude-code bursts. +// +// Order pinned: +// +// 1. PROVISION_TIMEOUT_SECONDS env beats everything (ops debug). +// 2. Manifest lookup beats hermes special-case + default. +// 3. Hermes default applies when lookup returns 0 for hermes. +// 4. DefaultProvisioningTimeout applies when lookup returns 0 for +// anything else. +// 5. Lookup returning 0 for ANY runtime is "no override" — never +// a 0-second timeout (which would kill every workspace instantly). +func TestProvisioningTimeout_ManifestOverride(t *testing.T) { + manifest := map[string]int{ + "claude-code": 900, // 15 min — what an ops manifest bump would set + "langgraph": 1200, + "hermes": 2400, // 40 min — manifest can override hermes default too + } + lookup := func(runtime string) int { return manifest[runtime] } + + cases := []struct { + name string + runtime string + want time.Duration + }{ + {"manifest override beats default for claude-code", "claude-code", 900 * time.Second}, + {"manifest override applied for langgraph", "langgraph", 1200 * time.Second}, + {"manifest override beats hermes default", "hermes", 2400 * time.Second}, + {"unknown runtime + no manifest entry → default", "unknown-runtime", DefaultProvisioningTimeout}, + {"empty runtime + no manifest entry → default", "", DefaultProvisioningTimeout}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := provisioningTimeoutFor(c.runtime, lookup); got != c.want { + t.Errorf("got %v, want %v", got, c.want) + } + }) + } + + // Env override beats manifest — ops debug must be the top priority. + t.Setenv("PROVISION_TIMEOUT_SECONDS", "60") + if got := provisioningTimeoutFor("claude-code", lookup); got.Seconds() != 60 { + t.Errorf("env-override should beat manifest: got %v, want 60s", got) + } + t.Setenv("PROVISION_TIMEOUT_SECONDS", "") + + // Lookup returning 0 means "no entry" — must NOT result in a + // 0-second timeout. Falls through to runtime defaults. + zeroLookup := func(_ string) int { return 0 } + if got := provisioningTimeoutFor("claude-code", zeroLookup); got != DefaultProvisioningTimeout { + t.Errorf("zero-from-lookup should fall through to default, got %v", got) + } + if got := provisioningTimeoutFor("hermes", zeroLookup); got != HermesProvisioningTimeout { + t.Errorf("zero-from-lookup should fall through to hermes default, got %v", got) + } +}