Merge pull request #2494 from Molecule-AI/fix/sweeper-honor-template-timeout

fix(sweeper): honour template-manifest provision_timeout_seconds
2026-05-02 04:47:53 +00:00 · 2026-05-02 04:47:53 +00:00 · 40e09508b6
commit 40e09508b6
parent 3ca2f40e16 18edf88d59
4 changed files with 135 additions and 26 deletions
--- a/workspace-server/cmd/server/main.go
+++ b/workspace-server/cmd/server/main.go
@ -260,7 +260,13 @@ func main() {
 	// and the state is incoherent (e.g. user sees "Retry" after 15min but
 	// backend still thinks provisioning is in progress).
 	go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
-		registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval)
+		// Pass the handler's per-runtime template-manifest lookup so the
+		// sweeper honours `runtime_config.provision_timeout_seconds`
+		// declared in any template's config.yaml — the same value the
+		// canvas already reads via addProvisionTimeoutMs. Without this
+		// the sweeper killed claude-code at the 10-min hardcoded floor
+		// regardless of the manifest. See registry.RuntimeTimeoutLookup.
+		registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval, wh.ProvisionTimeoutSecondsForRuntime)
 	})

 	// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@ -498,6 +498,22 @@ func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runt
 	}
 }

+// ProvisionTimeoutSecondsForRuntime returns the per-runtime provision
+// timeout in seconds when a template's config.yaml declared
+// `runtime_config.provision_timeout_seconds`, else 0 ("no override —
+// caller falls through to its own default").
+//
+// Exported so cmd/server/main.go can pass it to
+// registry.StartProvisioningTimeoutSweep — same template-manifest value
+// the canvas reads via addProvisionTimeoutMs. Without this, the
+// sweeper killed claude-code at 10 min while the manifest declared a
+// longer window, and a user saw the "Retry" UI before their image
+// pull even finished. See registry.RuntimeTimeoutLookup for the
+// resolution order.
+func (h *WorkspaceHandler) ProvisionTimeoutSecondsForRuntime(runtime string) int {
+	return h.provisionTimeouts.get(h.configsDir, runtime)
+}
+
 // scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
 func scanWorkspaceRow(rows interface {
 	Scan(dest ...interface{}) error
--- a/workspace-server/internal/registry/provisiontimeout.go
+++ b/workspace-server/internal/registry/provisiontimeout.go
@ -47,18 +47,44 @@ const HermesProvisioningTimeout = 30 * time.Minute
 // query which hits the primary key / status partial index.
 const DefaultProvisionSweepInterval = 30 * time.Second

-// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
-// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
-// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
-// runtimes — useful for ops debugging but loses the runtime nuance, so
-// operators should prefer the defaults unless they have a specific
-// reason.
-func provisioningTimeoutFor(runtime string) time.Duration {
+// RuntimeTimeoutLookup returns the per-runtime provision timeout in
+// seconds when a template's config.yaml declared
+// `runtime_config.provision_timeout_seconds`, else zero (= "no override,
+// fall through to runtime defaults below"). Same shape as
+// runtimeProvisionTimeoutsCache.get in handlers — wired through main.go
+// so this package stays template-discovery agnostic.
+//
+// Why an interface instead of importing the cache directly: registry
+// already sits below handlers in the import graph (handlers → registry,
+// not the reverse). A function-typed argument keeps that flow.
+type RuntimeTimeoutLookup func(runtime string) int
+
+// provisioningTimeoutFor picks the per-runtime sweep deadline. Resolution
+// order:
+//
+//  1. PROVISION_TIMEOUT_SECONDS env — global override, ops-debug only.
+//  2. Template manifest override (lookup) — what the canvas spinner
+//     also reads via #2054 phase 2. Without this, a template that
+//     declared `runtime_config.provision_timeout_seconds: 900` would
+//     still get killed by the sweeper at the 10-min hardcoded floor —
+//     a real wiring gap that drove every claude-code burst on a cold
+//     EC2 to false-positive timeout.
+//  3. Hermes special-case (CP bootstrap-watcher 25 min + 5 min slack).
+//  4. DefaultProvisioningTimeout (10 min) for everything else.
+//
+// lookup may be nil (during package tests, or before main.go has wired
+// it) — falls through to the legacy hermes/default split.
+func provisioningTimeoutFor(runtime string, lookup RuntimeTimeoutLookup) time.Duration {
 	if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
 		if n, err := strconv.Atoi(v); err == nil && n > 0 {
 			return time.Duration(n) * time.Second
 		}
 	}
+	if lookup != nil {
+		if secs := lookup(runtime); secs > 0 {
+			return time.Duration(secs) * time.Second
+		}
+	}
 	if runtime == "hermes" {
 		return HermesProvisioningTimeout
 	}
@ -74,7 +100,7 @@ func provisioningTimeoutFor(runtime string) time.Duration {
 // The sweep is idempotent: the UPDATE's WHERE clause re-checks both status
 // and age under the same row lock, so a workspace that raced to `online` or
 // was restarted while the sweep was scanning will not get flipped.
-func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration) {
+func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration, lookup RuntimeTimeoutLookup) {
 	if emitter == nil {
 		log.Println("Provision-timeout sweep: emitter is nil — skipping (no one to broadcast to)")
 		return
@ -85,15 +111,15 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
 	ticker := time.NewTicker(interval)
 	defer ticker.Stop()

-	log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
-		interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)
+	log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes / per-runtime manifest override=%v)",
+		interval, DefaultProvisioningTimeout, HermesProvisioningTimeout, lookup != nil)

 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
-			sweepStuckProvisioning(ctx, emitter)
+			sweepStuckProvisioning(ctx, emitter, lookup)
 		}
 	}
 }
@ -109,7 +135,7 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
 // sweep, leaving an incoherent "marked failed but actually working"
 // state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
 // canonical CP-side gating.
-func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
+func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter, lookup RuntimeTimeoutLookup) {
 	// We can't pre-filter by age in SQL because the threshold depends
 	// on the row's runtime. Pull every provisioning row + its runtime
 	// + its age, evaluate per-row in Go. Still cheap — the
@ -141,7 +167,7 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
 	}

 	for _, c := range ids {
-		timeout := provisioningTimeoutFor(c.runtime)
+		timeout := provisioningTimeoutFor(c.runtime, lookup)
 		timeoutSec := int(timeout / time.Second)
 		if c.ageSec < timeoutSec {
 			continue
--- a/workspace-server/internal/registry/provisiontimeout_test.go
+++ b/workspace-server/internal/registry/provisiontimeout_test.go
@ -66,7 +66,7 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
 		WillReturnResult(sqlmock.NewResult(0, 1))

 	emit := &fakeEmitter{}
-	sweepStuckProvisioning(context.Background(), emit)
+	sweepStuckProvisioning(context.Background(), emit, nil)

 	if emit.count() != 1 {
 		t.Fatalf("expected 1 event, got %d", emit.count())
@ -96,7 +96,7 @@ func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
 		WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))

 	emit := &fakeEmitter{}
-	sweepStuckProvisioning(context.Background(), emit)
+	sweepStuckProvisioning(context.Background(), emit, nil)

 	if emit.count() != 0 {
 		t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
@ -121,7 +121,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
 		WillReturnResult(sqlmock.NewResult(0, 1))

 	emit := &fakeEmitter{}
-	sweepStuckProvisioning(context.Background(), emit)
+	sweepStuckProvisioning(context.Background(), emit, nil)

 	if emit.count() != 1 {
 		t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
@ -151,7 +151,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
 		WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows — raced

 	emit := &fakeEmitter{}
-	sweepStuckProvisioning(context.Background(), emit)
+	sweepStuckProvisioning(context.Background(), emit, nil)

 	if emit.count() != 0 {
 		t.Errorf("expected 0 events on race, got %d", emit.count())
@ -170,7 +170,7 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
 		WillReturnRows(candidateRows())

 	emit := &fakeEmitter{}
-	sweepStuckProvisioning(context.Background(), emit)
+	sweepStuckProvisioning(context.Background(), emit, nil)

 	if emit.count() != 0 {
 		t.Errorf("expected 0 events when nothing stuck, got %d", emit.count())
@ -201,7 +201,7 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
 		WillReturnResult(sqlmock.NewResult(0, 1))

 	emit := &fakeEmitter{}
-	sweepStuckProvisioning(context.Background(), emit)
+	sweepStuckProvisioning(context.Background(), emit, nil)

 	if emit.count() != 2 {
 		t.Fatalf("expected 2 events, got %d", emit.count())
@ -222,7 +222,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {

 	emit := &fakeEmitter{fail: true}
 	// Must not panic.
-	sweepStuckProvisioning(context.Background(), emit)
+	sweepStuckProvisioning(context.Background(), emit, nil)
 }

 // TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
@ -231,18 +231,18 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
 func TestProvisioningTimeout_EnvOverride(t *testing.T) {
 	t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
 	// When env override is set it wins over runtime defaults.
-	if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
+	if got := provisioningTimeoutFor("", nil); got.Seconds() != 60 {
 		t.Errorf("override (no runtime): got %v, want 60s", got)
 	}
-	if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
+	if got := provisioningTimeoutFor("hermes", nil); got.Seconds() != 60 {
 		t.Errorf("override (hermes): got %v, want 60s", got)
 	}
 	t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
-	if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
+	if got := provisioningTimeoutFor("", nil); got != DefaultProvisioningTimeout {
 		t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
 	}
 	t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
-	if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
+	if got := provisioningTimeoutFor("claude-code", nil); got != DefaultProvisioningTimeout {
 		t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
 	}
 }
@ -266,8 +266,69 @@ func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
 		{"unknown-runtime", DefaultProvisioningTimeout},
 	}
 	for _, c := range cases {
-		if got := provisioningTimeoutFor(c.runtime); got != c.want {
+		if got := provisioningTimeoutFor(c.runtime, nil); got != c.want {
 			t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
 		}
 	}
 }
+
+// TestProvisioningTimeout_ManifestOverride pins the resolution order
+// when a template's config.yaml declared
+// `runtime_config.provision_timeout_seconds`. Without this gate, the
+// sweeper kept the hardcoded 10-min floor regardless of manifest —
+// which is the original wiring gap that drove false-positive timeouts
+// on cold-pull claude-code bursts.
+//
+// Order pinned:
+//
+//   1. PROVISION_TIMEOUT_SECONDS env beats everything (ops debug).
+//   2. Manifest lookup beats hermes special-case + default.
+//   3. Hermes default applies when lookup returns 0 for hermes.
+//   4. DefaultProvisioningTimeout applies when lookup returns 0 for
+//      anything else.
+//   5. Lookup returning 0 for ANY runtime is "no override" — never
+//      a 0-second timeout (which would kill every workspace instantly).
+func TestProvisioningTimeout_ManifestOverride(t *testing.T) {
+	manifest := map[string]int{
+		"claude-code": 900, // 15 min — what an ops manifest bump would set
+		"langgraph":   1200,
+		"hermes":      2400, // 40 min — manifest can override hermes default too
+	}
+	lookup := func(runtime string) int { return manifest[runtime] }
+
+	cases := []struct {
+		name    string
+		runtime string
+		want    time.Duration
+	}{
+		{"manifest override beats default for claude-code", "claude-code", 900 * time.Second},
+		{"manifest override applied for langgraph", "langgraph", 1200 * time.Second},
+		{"manifest override beats hermes default", "hermes", 2400 * time.Second},
+		{"unknown runtime + no manifest entry → default", "unknown-runtime", DefaultProvisioningTimeout},
+		{"empty runtime + no manifest entry → default", "", DefaultProvisioningTimeout},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := provisioningTimeoutFor(c.runtime, lookup); got != c.want {
+				t.Errorf("got %v, want %v", got, c.want)
+			}
+		})
+	}
+
+	// Env override beats manifest — ops debug must be the top priority.
+	t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
+	if got := provisioningTimeoutFor("claude-code", lookup); got.Seconds() != 60 {
+		t.Errorf("env-override should beat manifest: got %v, want 60s", got)
+	}
+	t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
+
+	// Lookup returning 0 means "no entry" — must NOT result in a
+	// 0-second timeout. Falls through to runtime defaults.
+	zeroLookup := func(_ string) int { return 0 }
+	if got := provisioningTimeoutFor("claude-code", zeroLookup); got != DefaultProvisioningTimeout {
+		t.Errorf("zero-from-lookup should fall through to default, got %v", got)
+	}
+	if got := provisioningTimeoutFor("hermes", zeroLookup); got != HermesProvisioningTimeout {
+		t.Errorf("zero-from-lookup should fall through to hermes default, got %v", got)
+	}
+}