Merge pull request #2494 from Molecule-AI/fix/sweeper-honor-template-timeout
fix(sweeper): honour template-manifest provision_timeout_seconds
This commit is contained in:
commit
40e09508b6
@ -260,7 +260,13 @@ func main() {
|
||||
// and the state is incoherent (e.g. user sees "Retry" after 15min but
|
||||
// backend still thinks provisioning is in progress).
|
||||
go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
|
||||
registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval)
|
||||
// Pass the handler's per-runtime template-manifest lookup so the
|
||||
// sweeper honours `runtime_config.provision_timeout_seconds`
|
||||
// declared in any template's config.yaml — the same value the
|
||||
// canvas already reads via addProvisionTimeoutMs. Without this
|
||||
// the sweeper killed claude-code at the 10-min hardcoded floor
|
||||
// regardless of the manifest. See registry.RuntimeTimeoutLookup.
|
||||
registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval, wh.ProvisionTimeoutSecondsForRuntime)
|
||||
})
|
||||
|
||||
// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
|
||||
|
||||
@ -498,6 +498,22 @@ func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runt
|
||||
}
|
||||
}
|
||||
|
||||
// ProvisionTimeoutSecondsForRuntime returns the per-runtime provision
// timeout in seconds when a template's config.yaml declared
// `runtime_config.provision_timeout_seconds`, else 0 ("no override —
// caller falls through to its own default").
//
// Exported so cmd/server/main.go can pass it to
// registry.StartProvisioningTimeoutSweep — the same template-manifest
// value the canvas reads via addProvisionTimeoutMs. Without this, the
// sweeper killed claude-code at 10 min while the manifest declared a
// longer window, and a user saw the "Retry" UI before their image
// pull even finished. See registry.RuntimeTimeoutLookup for the
// resolution order.
func (h *WorkspaceHandler) ProvisionTimeoutSecondsForRuntime(runtime string) int {
	// Delegates to the handler's manifest cache, keyed off the configs
	// directory — presumably a lazy per-runtime lookup of config.yaml;
	// see runtimeProvisionTimeoutsCache.get for the caching policy.
	return h.provisionTimeouts.get(h.configsDir, runtime)
}
|
||||
|
||||
// scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
|
||||
func scanWorkspaceRow(rows interface {
|
||||
Scan(dest ...interface{}) error
|
||||
|
||||
@ -47,18 +47,44 @@ const HermesProvisioningTimeout = 30 * time.Minute
|
||||
// query which hits the primary key / status partial index.
|
||||
const DefaultProvisionSweepInterval = 30 * time.Second
|
||||
|
||||
// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
|
||||
// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
|
||||
// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
|
||||
// runtimes — useful for ops debugging but loses the runtime nuance, so
|
||||
// operators should prefer the defaults unless they have a specific
|
||||
// reason.
|
||||
func provisioningTimeoutFor(runtime string) time.Duration {
|
||||
// RuntimeTimeoutLookup returns the per-runtime provision timeout in
// seconds when a template's config.yaml declared
// `runtime_config.provision_timeout_seconds`, else zero (= "no override,
// fall through to runtime defaults below"). Same shape as
// runtimeProvisionTimeoutsCache.get in handlers — wired through main.go
// so this package stays template-discovery agnostic.
//
// Why a function type instead of importing the cache directly: registry
// already sits below handlers in the import graph (handlers → registry,
// not the reverse). A function-typed argument keeps that flow.
type RuntimeTimeoutLookup func(runtime string) int
|
||||
|
||||
// provisioningTimeoutFor picks the per-runtime sweep deadline. Resolution
|
||||
// order:
|
||||
//
|
||||
// 1. PROVISION_TIMEOUT_SECONDS env — global override, ops-debug only.
|
||||
// 2. Template manifest override (lookup) — what the canvas spinner
|
||||
// also reads via #2054 phase 2. Without this, a template that
|
||||
// declared `runtime_config.provision_timeout_seconds: 900` would
|
||||
// still get killed by the sweeper at the 10-min hardcoded floor —
|
||||
// a real wiring gap that drove every claude-code burst on a cold
|
||||
// EC2 to false-positive timeout.
|
||||
// 3. Hermes special-case (CP bootstrap-watcher 25 min + 5 min slack).
|
||||
// 4. DefaultProvisioningTimeout (10 min) for everything else.
|
||||
//
|
||||
// lookup may be nil (during package tests, or before main.go has wired
|
||||
// it) — falls through to the legacy hermes/default split.
|
||||
func provisioningTimeoutFor(runtime string, lookup RuntimeTimeoutLookup) time.Duration {
|
||||
if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
|
||||
if n, err := strconv.Atoi(v); err == nil && n > 0 {
|
||||
return time.Duration(n) * time.Second
|
||||
}
|
||||
}
|
||||
if lookup != nil {
|
||||
if secs := lookup(runtime); secs > 0 {
|
||||
return time.Duration(secs) * time.Second
|
||||
}
|
||||
}
|
||||
if runtime == "hermes" {
|
||||
return HermesProvisioningTimeout
|
||||
}
|
||||
@ -74,7 +100,7 @@ func provisioningTimeoutFor(runtime string) time.Duration {
|
||||
// The sweep is idempotent: the UPDATE's WHERE clause re-checks both status
|
||||
// and age under the same row lock, so a workspace that raced to `online` or
|
||||
// was restarted while the sweep was scanning will not get flipped.
|
||||
func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration) {
|
||||
func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration, lookup RuntimeTimeoutLookup) {
|
||||
if emitter == nil {
|
||||
log.Println("Provision-timeout sweep: emitter is nil — skipping (no one to broadcast to)")
|
||||
return
|
||||
@ -85,15 +111,15 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
|
||||
interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)
|
||||
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes / per-runtime manifest override=%v)",
|
||||
interval, DefaultProvisioningTimeout, HermesProvisioningTimeout, lookup != nil)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
sweepStuckProvisioning(ctx, emitter)
|
||||
sweepStuckProvisioning(ctx, emitter, lookup)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -109,7 +135,7 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
|
||||
// sweep, leaving an incoherent "marked failed but actually working"
|
||||
// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
|
||||
// canonical CP-side gating.
|
||||
func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
|
||||
func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter, lookup RuntimeTimeoutLookup) {
|
||||
// We can't pre-filter by age in SQL because the threshold depends
|
||||
// on the row's runtime. Pull every provisioning row + its runtime
|
||||
// + its age, evaluate per-row in Go. Still cheap — the
|
||||
@ -141,7 +167,7 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
|
||||
}
|
||||
|
||||
for _, c := range ids {
|
||||
timeout := provisioningTimeoutFor(c.runtime)
|
||||
timeout := provisioningTimeoutFor(c.runtime, lookup)
|
||||
timeoutSec := int(timeout / time.Second)
|
||||
if c.ageSec < timeoutSec {
|
||||
continue
|
||||
|
||||
@ -66,7 +66,7 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
sweepStuckProvisioning(context.Background(), emit, nil)
|
||||
|
||||
if emit.count() != 1 {
|
||||
t.Fatalf("expected 1 event, got %d", emit.count())
|
||||
@ -96,7 +96,7 @@ func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
|
||||
WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
sweepStuckProvisioning(context.Background(), emit, nil)
|
||||
|
||||
if emit.count() != 0 {
|
||||
t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
|
||||
@ -121,7 +121,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
sweepStuckProvisioning(context.Background(), emit, nil)
|
||||
|
||||
if emit.count() != 1 {
|
||||
t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
|
||||
@ -151,7 +151,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows — raced
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
sweepStuckProvisioning(context.Background(), emit, nil)
|
||||
|
||||
if emit.count() != 0 {
|
||||
t.Errorf("expected 0 events on race, got %d", emit.count())
|
||||
@ -170,7 +170,7 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
|
||||
WillReturnRows(candidateRows())
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
sweepStuckProvisioning(context.Background(), emit, nil)
|
||||
|
||||
if emit.count() != 0 {
|
||||
t.Errorf("expected 0 events when nothing stuck, got %d", emit.count())
|
||||
@ -201,7 +201,7 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
sweepStuckProvisioning(context.Background(), emit, nil)
|
||||
|
||||
if emit.count() != 2 {
|
||||
t.Fatalf("expected 2 events, got %d", emit.count())
|
||||
@ -222,7 +222,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
|
||||
|
||||
emit := &fakeEmitter{fail: true}
|
||||
// Must not panic.
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
sweepStuckProvisioning(context.Background(), emit, nil)
|
||||
}
|
||||
|
||||
// TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
|
||||
@ -231,18 +231,18 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
|
||||
func TestProvisioningTimeout_EnvOverride(t *testing.T) {
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
|
||||
// When env override is set it wins over runtime defaults.
|
||||
if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
|
||||
if got := provisioningTimeoutFor("", nil); got.Seconds() != 60 {
|
||||
t.Errorf("override (no runtime): got %v, want 60s", got)
|
||||
}
|
||||
if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
|
||||
if got := provisioningTimeoutFor("hermes", nil); got.Seconds() != 60 {
|
||||
t.Errorf("override (hermes): got %v, want 60s", got)
|
||||
}
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
|
||||
if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
|
||||
if got := provisioningTimeoutFor("", nil); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
|
||||
}
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
|
||||
if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
|
||||
if got := provisioningTimeoutFor("claude-code", nil); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
|
||||
}
|
||||
}
|
||||
@ -266,8 +266,69 @@ func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
|
||||
{"unknown-runtime", DefaultProvisioningTimeout},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := provisioningTimeoutFor(c.runtime); got != c.want {
|
||||
if got := provisioningTimeoutFor(c.runtime, nil); got != c.want {
|
||||
t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestProvisioningTimeout_ManifestOverride pins the resolution order
|
||||
// when a template's config.yaml declared
|
||||
// `runtime_config.provision_timeout_seconds`. Without this gate, the
|
||||
// sweeper kept the hardcoded 10-min floor regardless of manifest —
|
||||
// which is the original wiring gap that drove false-positive timeouts
|
||||
// on cold-pull claude-code bursts.
|
||||
//
|
||||
// Order pinned:
|
||||
//
|
||||
// 1. PROVISION_TIMEOUT_SECONDS env beats everything (ops debug).
|
||||
// 2. Manifest lookup beats hermes special-case + default.
|
||||
// 3. Hermes default applies when lookup returns 0 for hermes.
|
||||
// 4. DefaultProvisioningTimeout applies when lookup returns 0 for
|
||||
// anything else.
|
||||
// 5. Lookup returning 0 for ANY runtime is "no override" — never
|
||||
// a 0-second timeout (which would kill every workspace instantly).
|
||||
func TestProvisioningTimeout_ManifestOverride(t *testing.T) {
|
||||
manifest := map[string]int{
|
||||
"claude-code": 900, // 15 min — what an ops manifest bump would set
|
||||
"langgraph": 1200,
|
||||
"hermes": 2400, // 40 min — manifest can override hermes default too
|
||||
}
|
||||
lookup := func(runtime string) int { return manifest[runtime] }
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
runtime string
|
||||
want time.Duration
|
||||
}{
|
||||
{"manifest override beats default for claude-code", "claude-code", 900 * time.Second},
|
||||
{"manifest override applied for langgraph", "langgraph", 1200 * time.Second},
|
||||
{"manifest override beats hermes default", "hermes", 2400 * time.Second},
|
||||
{"unknown runtime + no manifest entry → default", "unknown-runtime", DefaultProvisioningTimeout},
|
||||
{"empty runtime + no manifest entry → default", "", DefaultProvisioningTimeout},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
if got := provisioningTimeoutFor(c.runtime, lookup); got != c.want {
|
||||
t.Errorf("got %v, want %v", got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Env override beats manifest — ops debug must be the top priority.
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
|
||||
if got := provisioningTimeoutFor("claude-code", lookup); got.Seconds() != 60 {
|
||||
t.Errorf("env-override should beat manifest: got %v, want 60s", got)
|
||||
}
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
|
||||
|
||||
// Lookup returning 0 means "no entry" — must NOT result in a
|
||||
// 0-second timeout. Falls through to runtime defaults.
|
||||
zeroLookup := func(_ string) int { return 0 }
|
||||
if got := provisioningTimeoutFor("claude-code", zeroLookup); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("zero-from-lookup should fall through to default, got %v", got)
|
||||
}
|
||||
if got := provisioningTimeoutFor("hermes", zeroLookup); got != HermesProvisioningTimeout {
|
||||
t.Errorf("zero-from-lookup should fall through to hermes default, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user