fix(registry): runtime-aware provision-timeout sweep — give hermes 30 min

Pre-fix: workspace-server's provision-timeout sweep was hardcoded
at 10 min for all runtimes. The CP-side bootstrap-watcher (cp#245)
correctly gives hermes 25 min for cold-boot (hermes installs
include apt + uv + Python venv + Node + hermes-agent — 13–25 min on
slow apt mirrors is normal). The two timeout systems disagreed:
the watcher would happily wait 25 min, but the workspace-server's
sweep killed healthy hermes boots mid-install at the 10-minute mark
and marked them failed.

Today's example: #2061's E2E run on 2026-04-26 at 08:06:34Z created
a hermes workspace; EC2 cloud-init was visibly making progress on
apt installs (libcjson1, libmbedcrypto7t64) when the sweep flipped
status to 'failed' at 08:17:00Z (10:26 elapsed). The test threw
"Workspace failed: " (empty error from sql.NullString serialization)
and CI failed on a healthy boot.
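
Minimal illustration of the empty-error mechanics: a NULL error
column scans into sql.NullString with Valid=false and a zero-value
String, and that zero value is what got formatted into the message.
(A sketch, not the registry's actual serialization path.)

  package main

  import (
      "database/sql"
      "fmt"
  )

  func main() {
      var e sql.NullString // as if scanned from a NULL column: Valid=false, String=""
      fmt.Printf("Workspace failed: %s\n", e.String) // prints: Workspace failed:
  }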

Fix: provisioningTimeoutFor(runtime) — same shape as the CP's
bootstrapTimeoutFn:
  - hermes:  30 min (watcher's 25 min + 5 min slack)
  - others:  10 min (unchanged — claude-code/langgraph/etc. boot
                     in <5 min, 10 min is plenty)
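
In sketch form (the full helper, env override included, is in the
diff below):

  func provisioningTimeoutFor(runtime string) time.Duration {
      // env-override branch elided here; see the diff
      if runtime == "hermes" {
          return HermesProvisioningTimeout // 30 * time.Minute
      }
      return DefaultProvisioningTimeout // 10 * time.Minute
  }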

PROVISION_TIMEOUT_SECONDS env override still works (applies to all
runtimes — operators who care about the runtime distinction
shouldn't use the override anyway).
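
The override wins over the runtime check, so it shortens hermes too;
illustrative calls against the helper from the diff:

  os.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
  provisioningTimeoutFor("hermes")      // 60s, override beats the 30-min default
  provisioningTimeoutFor("claude-code") // 60s, same flat value for every runtime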

Sweep query change: pulls (id, runtime, age_sec) per row instead
of pre-filtering by age in SQL. Per-row Go evaluation picks the
correct timeout. Slightly more rows scanned but bounded by the
status='provisioning' partial index — workspaces in flight, not
historical.
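
For reference, the partial index this leans on would be roughly the
following shape (the migration predates this diff, so the name and
exact definition here are assumed):

  // hypothetical migration SQL, Go-embedded to match the codebase:
  const createProvisioningIdx = `
      CREATE INDEX IF NOT EXISTS workspaces_status_provisioning_idx
      ON workspaces (updated_at)
      WHERE status = 'provisioning'`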

Tests:
  - TestProvisioningTimeout_RuntimeAware — locks in the per-runtime
    mapping
  - TestSweepStuckProvisioning_HermesGets30MinSlack — hermes at
    11 min must NOT be flipped
  - TestSweepStuckProvisioning_HermesPastDeadline — hermes at
    31 min IS flipped, payload includes runtime
  - Existing tests updated for the new query shape

Verified:
  - go build ./... clean
  - go vet ./... clean
  - go test ./... all green

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@ -18,30 +18,49 @@ type ProvisionTimeoutEmitter interface {
 }

 // DefaultProvisioningTimeout is how long a workspace may sit in
-// status='provisioning' before the sweeper flips it to 'failed'. The
-// container-launch path has its own 3-minute context timeout
-// (provisioner.ProvisionTimeout) but that only bounds the docker API call —
-// a container that started but crashes before /registry/register never
-// triggers that path and would sit in provisioning forever. 10 minutes
-// covers pathological image-pull + user-data execution on a cold EC2 worker
-// while still getting well ahead of the "15+ minute" stuck state users see
-// in production.
+// status='provisioning' before the sweeper flips it to 'failed'.
+// Default for non-hermes runtimes (claude-code, langgraph, crewai,
+// autogen, etc.) which cold-boot in <5 min. The container-launch path
+// has its own 3-minute context timeout (provisioner.ProvisionTimeout)
+// but that only bounds the docker API call — a container that started
+// but crashes before /registry/register never triggers that path and
+// would sit in provisioning forever. 10 minutes covers pathological
+// image-pull + user-data execution on a cold EC2 worker while still
+// getting well ahead of the "15+ minute" stuck state users see in
+// production.
 const DefaultProvisioningTimeout = 10 * time.Minute

+// HermesProvisioningTimeout matches the CP bootstrap-watcher's
+// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
+// + 5 min sweep slack. Hermes cold-boot does apt + uv + Python venv +
+// Node + hermes-agent install — 13–25 min on slow apt mirrors is
+// normal. Without this, the sweep would flip the workspace to 'failed'
+// at 10 min while the watcher (and the workspace itself) is still
+// happily progressing through install. Issue #1843 follow-up: a
+// healthy 10.5-min hermes boot was killed by the 10-min sweep on
+// 2026-04-26, breaking #2061's E2E.
+const HermesProvisioningTimeout = 30 * time.Minute
+
 // DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
 // as the hibernation monitor — cheap and bounded by the provisioning-state
 // query which hits the primary key / status partial index.
 const DefaultProvisionSweepInterval = 30 * time.Second

-// provisioningTimeout reads the override from env, falling back to the
-// default. Env var expressed in seconds so operators can tune via a normal
-// container restart without a code change.
-func provisioningTimeout() time.Duration {
+// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
+// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
+// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
+// runtimes — useful for ops debugging but loses the runtime nuance, so
+// operators should prefer the defaults unless they have a specific
+// reason.
+func provisioningTimeoutFor(runtime string) time.Duration {
     if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
         if n, err := strconv.Atoi(v); err == nil && n > 0 {
             return time.Duration(n) * time.Second
         }
     }
+    if runtime == "hermes" {
+        return HermesProvisioningTimeout
+    }
     return DefaultProvisioningTimeout
 }
@ -65,7 +84,8 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
     ticker := time.NewTicker(interval)
     defer ticker.Stop()

-    log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s)", interval, provisioningTimeout())
+    log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
+        interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)

     for {
         select {
@ -80,33 +100,51 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
 // sweepStuckProvisioning is one tick of the sweeper. Exported-for-test via
 // the package boundary: keep all time.Now reads inside so tests can drive it
 // deterministically by seeding updated_at rather than manipulating time.
+//
+// Runtime-aware: the per-workspace timeout depends on `runtime`. Hermes
+// gets 30 min (matching the CP bootstrap-watcher's 25-min deadline + 5
+// min slack); everything else gets 10 min. Without this distinction a
+// healthy hermes cold-boot at 10–25 min got killed mid-install by this
+// sweep, leaving an incoherent "marked failed but actually working"
+// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
+// canonical CP-side gating.
 func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
-    timeout := provisioningTimeout()
-    timeoutSec := int(timeout / time.Second)
-
-    // Read candidates first so the event broadcast can include each id. The
-    // subsequent UPDATE re-checks the predicate to stay race-safe against
-    // concurrent restart / register paths that write updated_at.
+    // We can't pre-filter by age in SQL because the threshold depends
+    // on the row's runtime. Pull every provisioning row + its runtime
+    // + its age, evaluate per-row in Go. Still cheap — the
+    // status='provisioning' row count is bounded (workspaces in
+    // flight, not historical) and the partial index on status keeps
+    // it fast.
     rows, err := db.DB.QueryContext(ctx, `
-        SELECT id FROM workspaces
+        SELECT id, COALESCE(runtime, ''), EXTRACT(EPOCH FROM (now() - updated_at))::int
+        FROM workspaces
         WHERE status = 'provisioning'
-        AND updated_at < now() - ($1 || ' seconds')::interval
-    `, timeoutSec)
+    `)
     if err != nil {
         log.Printf("Provision-timeout sweep: query error: %v", err)
         return
     }
     defer rows.Close()

-    var ids []string
+    type candidate struct {
+        id      string
+        runtime string
+        ageSec  int
+    }
+    var ids []candidate
     for rows.Next() {
-        var id string
-        if err := rows.Scan(&id); err == nil {
-            ids = append(ids, id)
+        var c candidate
+        if err := rows.Scan(&c.id, &c.runtime, &c.ageSec); err == nil {
+            ids = append(ids, c)
         }
     }

-    for _, id := range ids {
+    for _, c := range ids {
+        timeout := provisioningTimeoutFor(c.runtime)
+        timeoutSec := int(timeout / time.Second)
+        if c.ageSec < timeoutSec {
+            continue
+        }
         msg := "provisioning timed out — container started but never called /registry/register. Check container logs and network connectivity to the platform."
         res, err := db.DB.ExecContext(ctx, `
             UPDATE workspaces
@ -116,9 +154,9 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
             WHERE id = $1
             AND status = 'provisioning'
             AND updated_at < now() - ($3 || ' seconds')::interval
-        `, id, msg, timeoutSec)
+        `, c.id, msg, timeoutSec)
         if err != nil {
-            log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", id, err)
+            log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", c.id, err)
             continue
         }
         affected, _ := res.RowsAffected()
@ -126,18 +164,19 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
             // Raced with restart / register — no harm, just skip.
             continue
         }
-        log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
+        log.Printf("Provision-timeout sweep: %s (runtime=%q) stuck in provisioning > %s — marked failed", c.id, c.runtime, timeout)

         // Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the
         // canvas event handler only flips node state on the _FAILED case.
         // A separate event type was considered but the UI reaction is
         // identical either way — operators who need to distinguish can
         // tell from the `source` payload field.
-        if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{
+        if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", c.id, map[string]interface{}{
             "error":        msg,
             "timeout_secs": timeoutSec,
+            "runtime":      c.runtime,
             "source":       "provision_timeout_sweep",
         }); emitErr != nil {
-            log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
+            log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", c.id, emitErr)
         }
     }
 }


@ -5,6 +5,7 @@ import (
"errors"
"sync"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
)
@ -40,13 +41,24 @@ func (f *fakeEmitter) count() int {
     return len(f.events)
 }

+// candidateRows builds the new-shape query result (id, runtime, age_sec).
+// Use this in every sweep test to match the runtime-aware SELECT.
+func candidateRows(rows ...[3]any) *sqlmock.Rows {
+    r := sqlmock.NewRows([]string{"id", "runtime", "age_sec"})
+    for _, row := range rows {
+        r = r.AddRow(row[0], row[1], row[2])
+    }
+    return r
+}
+
 // TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
 // provisioning workspace gets flipped to failed AND an event is broadcast.
 func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+    // claude-code workspace, 700s old > 600s default timeout → flipped.
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -69,6 +81,60 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
     }
 }

+// TestSweepStuckProvisioning_HermesGets30MinSlack — the regression that
+// motivated the runtime-aware change. A hermes workspace 11 min into
+// cold-boot must NOT be flipped to failed; the watcher's 25-min budget
+// covers it. Without the fix, the 10-min sweep killed healthy hermes
+// boots mid-install (issue #2061's E2E failure on 2026-04-26).
+func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
+    mock := setupTestDB(t)
+
+    // 11 min = 660 sec < HermesProvisioningTimeout (1800 sec).
+    // No UPDATE should fire — hermes still has time.
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
+
+    emit := &fakeEmitter{}
+    sweepStuckProvisioning(context.Background(), emit)
+
+    if emit.count() != 0 {
+        t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
+    }
+    if err := mock.ExpectationsWereMet(); err != nil {
+        t.Errorf("unmet expectations: %v", err)
+    }
+}
+
+// TestSweepStuckProvisioning_HermesPastDeadline — a hermes workspace
+// past 30 min DOES get flipped. Closes the loop on the runtime-aware
+// fix: it's still bounded, just with a longer threshold than other
+// runtimes.
+func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
+    mock := setupTestDB(t)
+
+    // 31 min = 1860 sec > HermesProvisioningTimeout (1800 sec).
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-hermes-stuck", "hermes", 1860}))
+
+    mock.ExpectExec(`UPDATE workspaces`).
+        WithArgs("ws-hermes-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
+        WillReturnResult(sqlmock.NewResult(0, 1))
+
+    emit := &fakeEmitter{}
+    sweepStuckProvisioning(context.Background(), emit)
+
+    if emit.count() != 1 {
+        t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
+    }
+    // Payload should include runtime so ops can distinguish in logs.
+    payload, ok := emit.events[0].Payload.(map[string]interface{})
+    if !ok {
+        t.Fatalf("payload not a map: %T", emit.events[0].Payload)
+    }
+    if payload["runtime"] != "hermes" {
+        t.Errorf("payload.runtime = %v, want hermes", payload["runtime"])
+    }
+}
+
 // TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
 // 0 rows because the workspace flipped to online (or got restarted) between
 // the SELECT and the UPDATE. We should skip the event, not emit a false
@ -76,8 +142,8 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
 func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-raced"))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-raced", "claude-code", 700}))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -99,8 +165,8 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
 func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows())

     emit := &fakeEmitter{}
     sweepStuckProvisioning(context.Background(), emit)
@ -115,14 +181,16 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
 // TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
 // both agents (claude-code + hermes) are stuck — both should get flipped
-// and both should get events.
+// and both should get events. claude-code at 11 min (over its 10-min
+// limit), hermes at 31 min (over its 30-min limit).
 func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).
-            AddRow("ws-claude-code").
-            AddRow("ws-hermes"))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows(
+            [3]any{"ws-claude-code", "claude-code", 700},
+            [3]any{"ws-hermes", "hermes", 1860},
+        ))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -145,8 +213,8 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
 func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
         WillReturnResult(sqlmock.NewResult(0, 1))
@ -158,18 +226,47 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
 // TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
 // env var takes effect when set to a positive integer, and falls back to
-// default otherwise.
+// the per-runtime default otherwise.
 func TestProvisioningTimeout_EnvOverride(t *testing.T) {
     t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
-    if got := provisioningTimeout(); got.Seconds() != 60 {
-        t.Errorf("override: got %v, want 60s", got)
+    // When env override is set it wins over runtime defaults.
+    if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
+        t.Errorf("override (no runtime): got %v, want 60s", got)
+    }
+    if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
+        t.Errorf("override (hermes): got %v, want 60s", got)
     }

     t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
-    if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
-        t.Errorf("default: got %v, want %v", got, DefaultProvisioningTimeout)
+    if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
+        t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
     }

     t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
-    if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
-        t.Errorf("bad override: got %v, want default %v", got, DefaultProvisioningTimeout)
+    if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
+        t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
     }
 }

+// TestProvisioningTimeout_RuntimeAware verifies hermes gets the longer
+// HermesProvisioningTimeout while other runtimes keep the default.
+// Mirrors bootstrap_watcher.go's bootstrapTimeoutFn — these two
+// timeouts must stay in sync (sweep > watcher) or healthy hermes
+// boots get killed mid-install.
+func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
+    cases := []struct {
+        runtime string
+        want    time.Duration
+    }{
+        {"hermes", HermesProvisioningTimeout},
+        {"langgraph", DefaultProvisioningTimeout},
+        {"claude-code", DefaultProvisioningTimeout},
+        {"crewai", DefaultProvisioningTimeout},
+        {"autogen", DefaultProvisioningTimeout},
+        {"", DefaultProvisioningTimeout},
+        {"unknown-runtime", DefaultProvisioningTimeout},
+    }
+    for _, c := range cases {
+        if got := provisioningTimeoutFor(c.runtime); got != c.want {
+            t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
+        }
+    }
+}