From be1beff4a089e71a16b47f6e0b522a909f8ffcca Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Sun, 26 Apr 2026 01:44:09 -0700
Subject: [PATCH] =?UTF-8?q?fix(registry):=20runtime-aware=20provision-time?=
 =?UTF-8?q?out=20sweep=20=E2=80=94=20give=20hermes=2030=20min?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-fix: workspace-server's provision-timeout sweep was hardcoded at 10 min
for all runtimes. The CP-side bootstrap-watcher (cp#245) correctly gives
hermes 25 min for cold-boot (hermes installs include apt + uv + Python venv +
Node + hermes-agent — 13–25 min on slow apt mirrors is normal). The two
timeout systems disagreed: the watcher would happily wait 25 min, but the
workspace-server's 10-min sweep killed healthy hermes boots mid-install and
marked them failed.

Today's example: #2061's E2E run on 2026-04-26 at 08:06:34Z created a hermes
workspace; EC2 cloud-init was visibly making progress on apt-installs
(libcjson1, libmbedcrypto7t64) when the sweep flipped status to 'failed' at
08:17:00Z (10:26 elapsed). The test threw "Workspace failed: " (empty error
from sql.NullString serialization) and CI failed on a healthy boot.

Fix: provisioningTimeoutFor(runtime) — same shape as the CP's
bootstrapTimeoutFn:
- hermes: 30 min (watcher's 25 min + 5 min slack)
- others: 10 min (unchanged — claude-code/langgraph/etc. boot in <5 min,
  10 min is plenty)

PROVISION_TIMEOUT_SECONDS env override still works (applies to all runtimes —
operators who care about the runtime distinction shouldn't use the override
anyway).

Sweep query change: pulls (id, runtime, age_sec) per row instead of
pre-filtering by age in SQL. Per-row Go evaluation picks the correct timeout.
Slightly more rows scanned but bounded by the status='provisioning' partial
index — workspaces in flight, not historical.

Tests:
- TestProvisioningTimeout_RuntimeAware — locks in the per-runtime mapping
- TestSweepStuckProvisioning_HermesGets30MinSlack — hermes at 11 min must
  NOT be flipped
- TestSweepStuckProvisioning_HermesPastDeadline — hermes at 31 min IS
  flipped, payload includes runtime
- Existing tests updated for the new query shape

Verified:
- go build ./... clean
- go vet ./... clean
- go test ./... all green

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../internal/registry/provisiontimeout.go    | 103 +++++++++----
 .../registry/provisiontimeout_test.go        | 137 +++++++++++++++---
 2 files changed, 188 insertions(+), 52 deletions(-)

diff --git a/workspace-server/internal/registry/provisiontimeout.go b/workspace-server/internal/registry/provisiontimeout.go
index 0201eb9b..b88cf54e 100644
--- a/workspace-server/internal/registry/provisiontimeout.go
+++ b/workspace-server/internal/registry/provisiontimeout.go
@@ -18,30 +18,49 @@ type ProvisionTimeoutEmitter interface {
 }

 // DefaultProvisioningTimeout is how long a workspace may sit in
-// status='provisioning' before the sweeper flips it to 'failed'. The
-// container-launch path has its own 3-minute context timeout
-// (provisioner.ProvisionTimeout) but that only bounds the docker API call —
-// a container that started but crashes before /registry/register never
-// triggers that path and would sit in provisioning forever. 10 minutes
-// covers pathological image-pull + user-data execution on a cold EC2 worker
-// while still getting well ahead of the "15+ minute" stuck state users see
-// in production.
+// status='provisioning' before the sweeper flips it to 'failed'.
+// Default for non-hermes runtimes (claude-code, langgraph, crewai,
+// autogen, etc.) which cold-boot in <5 min. The container-launch path
+// has its own 3-minute context timeout (provisioner.ProvisionTimeout)
+// but that only bounds the docker API call — a container that started
+// but crashes before /registry/register never triggers that path and
+// would sit in provisioning forever. 10 minutes covers pathological
+// image-pull + user-data execution on a cold EC2 worker while still
+// getting well ahead of the "15+ minute" stuck state users see in
+// production.
 const DefaultProvisioningTimeout = 10 * time.Minute

+// HermesProvisioningTimeout matches the CP bootstrap-watcher's
+// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
+// + 5 min sweep slack. Hermes cold-boot does apt + uv + Python venv +
+// Node + hermes-agent install — 13–25 min on slow apt mirrors is
+// normal. Without this, the sweep would flip the workspace to 'failed'
+// at 10 min while the watcher (and the workspace itself) is still
+// happily progressing through install. Issue #1843 follow-up: a
+// healthy 10.5-min hermes boot was killed by the 10-min sweep on
+// 2026-04-26, breaking #2061's E2E.
+const HermesProvisioningTimeout = 30 * time.Minute
+
 // DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
 // as the hibernation monitor — cheap and bounded by the provisioning-state
 // query which hits the primary key / status partial index.
 const DefaultProvisionSweepInterval = 30 * time.Second

-// provisioningTimeout reads the override from env, falling back to the
-// default. Env var expressed in seconds so operators can tune via a normal
-// container restart without a code change.
-func provisioningTimeout() time.Duration {
+// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
+// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
+// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
+// runtimes — useful for ops debugging but loses the runtime nuance, so
+// operators should prefer the defaults unless they have a specific
+// reason.
+func provisioningTimeoutFor(runtime string) time.Duration {
 	if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
 		if n, err := strconv.Atoi(v); err == nil && n > 0 {
 			return time.Duration(n) * time.Second
 		}
 	}
+	if runtime == "hermes" {
+		return HermesProvisioningTimeout
+	}
 	return DefaultProvisioningTimeout
 }

@@ -65,7 +84,8 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
 	ticker := time.NewTicker(interval)
 	defer ticker.Stop()

-	log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s)", interval, provisioningTimeout())
+	log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
+		interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)

 	for {
 		select {
@@ -80,33 +100,51 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
 // sweepStuckProvisioning is one tick of the sweeper. Exported-for-test via
 // the package boundary: keep all time.Now reads inside so tests can drive it
 // deterministically by seeding updated_at rather than manipulating time.
+//
+// Runtime-aware: the per-workspace timeout depends on `runtime`. Hermes
+// gets 30 min (matching the CP bootstrap-watcher's 25-min deadline + 5
+// min slack); everything else gets 10 min. Without this distinction a
+// healthy hermes cold-boot at 10–25 min got killed mid-install by this
+// sweep, leaving an incoherent "marked failed but actually working"
+// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
+// canonical CP-side gating.
 func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
-	timeout := provisioningTimeout()
-	timeoutSec := int(timeout / time.Second)
-
-	// Read candidates first so the event broadcast can include each id. The
-	// subsequent UPDATE re-checks the predicate to stay race-safe against
-	// concurrent restart / register paths that write updated_at.
+	// We can't pre-filter by age in SQL because the threshold depends
+	// on the row's runtime. Pull every provisioning row + its runtime
+	// + its age, evaluate per-row in Go. Still cheap — the
+	// status='provisioning' row count is bounded (workspaces in
+	// flight, not historical) and the partial index on status keeps
+	// it fast.
 	rows, err := db.DB.QueryContext(ctx, `
-		SELECT id FROM workspaces
+		SELECT id, COALESCE(runtime, ''), EXTRACT(EPOCH FROM (now() - updated_at))::int
+		FROM workspaces
 		WHERE status = 'provisioning'
-		AND updated_at < now() - ($1 || ' seconds')::interval
-	`, timeoutSec)
+	`)
 	if err != nil {
 		log.Printf("Provision-timeout sweep: query error: %v", err)
 		return
 	}
 	defer rows.Close()

-	var ids []string
+	type candidate struct {
+		id      string
+		runtime string
+		ageSec  int
+	}
+	var ids []candidate
 	for rows.Next() {
-		var id string
-		if err := rows.Scan(&id); err == nil {
-			ids = append(ids, id)
+		var c candidate
+		if err := rows.Scan(&c.id, &c.runtime, &c.ageSec); err == nil {
+			ids = append(ids, c)
 		}
 	}

-	for _, id := range ids {
+	for _, c := range ids {
+		timeout := provisioningTimeoutFor(c.runtime)
+		timeoutSec := int(timeout / time.Second)
+		if c.ageSec < timeoutSec {
+			continue
+		}
 		msg := "provisioning timed out — container started but never called /registry/register. Check container logs and network connectivity to the platform."
 		res, err := db.DB.ExecContext(ctx, `
 			UPDATE workspaces
@@ -116,9 +154,9 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
 			WHERE id = $1
 			AND status = 'provisioning'
 			AND updated_at < now() - ($3 || ' seconds')::interval
-		`, id, msg, timeoutSec)
+		`, c.id, msg, timeoutSec)
 		if err != nil {
-			log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", id, err)
+			log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", c.id, err)
 			continue
 		}
 		affected, _ := res.RowsAffected()
@@ -126,18 +164,19 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
 			// Raced with restart / register — no harm, just skip.
 			continue
 		}
-		log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
+		log.Printf("Provision-timeout sweep: %s (runtime=%q) stuck in provisioning > %s — marked failed", c.id, c.runtime, timeout)
 		// Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the
 		// canvas event handler only flips node state on the _FAILED case.
 		// A separate event type was considered but the UI reaction is
 		// identical either way — operators who need to distinguish can
 		// tell from the `source` payload field.
-		if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{
+		if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", c.id, map[string]interface{}{
 			"error":        msg,
 			"timeout_secs": timeoutSec,
+			"runtime":      c.runtime,
 			"source":       "provision_timeout_sweep",
 		}); emitErr != nil {
-			log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
+			log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", c.id, emitErr)
 		}
 	}
 }
diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go
index a5009a56..830f6774 100644
--- a/workspace-server/internal/registry/provisiontimeout_test.go
+++ b/workspace-server/internal/registry/provisiontimeout_test.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"sync"
 	"testing"
+	"time"

 	"github.com/DATA-DOG/go-sqlmock"
 )
@@ -40,13 +41,24 @@ func (f *fakeEmitter) count() int {
 	return len(f.events)
 }

+// candidateRows builds the new-shape query result (id, runtime, age_sec).
+// Use this in every sweep test to match the runtime-aware SELECT.
+func candidateRows(rows ...[3]any) *sqlmock.Rows {
+	r := sqlmock.NewRows([]string{"id", "runtime", "age_sec"})
+	for _, row := range rows {
+		r = r.AddRow(row[0], row[1], row[2])
+	}
+	return r
+}
+
 // TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
 // provisioning workspace gets flipped to failed AND an event is broadcast.
 func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
 	mock := setupTestDB(t)

-	mock.ExpectQuery(`SELECT id FROM workspaces`).
-		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+	// claude-code workspace, 700s old > 600s default timeout → flipped.
+	mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+		WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))

 	mock.ExpectExec(`UPDATE workspaces`).
 		WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
@@ -69,6 +81,60 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
 	}
 }

+// TestSweepStuckProvisioning_HermesGets30MinSlack — the regression that
+// motivated the runtime-aware change. A hermes workspace 11 min into
+// cold-boot must NOT be flipped to failed; the watcher's 25-min budget
+// covers it. Without the fix, the 10-min sweep killed healthy hermes
+// boots mid-install (issue #2061's E2E failure on 2026-04-26).
+func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
+	mock := setupTestDB(t)
+
+	// 11 min = 660 sec. < HermesProvisioningTimeout (1800s).
+	// No UPDATE should fire — hermes still has time.
+	mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+		WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
+
+	emit := &fakeEmitter{}
+	sweepStuckProvisioning(context.Background(), emit)
+
+	if emit.count() != 0 {
+		t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet expectations: %v", err)
+	}
+}
+
+// TestSweepStuckProvisioning_HermesPastDeadline — a hermes workspace
+// past 30 min DOES get flipped. Closes the loop on the runtime-aware
+// fix: it's still bounded, just with a longer threshold than other
+// runtimes.
+func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
+	mock := setupTestDB(t)
+
+	// 31 min = 1860 sec > HermesProvisioningTimeout (1800s).
+	mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+		WillReturnRows(candidateRows([3]any{"ws-hermes-stuck", "hermes", 1860}))
+	mock.ExpectExec(`UPDATE workspaces`).
+		WithArgs("ws-hermes-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	emit := &fakeEmitter{}
+	sweepStuckProvisioning(context.Background(), emit)
+
+	if emit.count() != 1 {
+		t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
+	}
+	// Payload should include runtime so ops can distinguish in logs.
+	payload, ok := emit.events[0].Payload.(map[string]interface{})
+	if !ok {
+		t.Fatalf("payload not a map: %T", emit.events[0].Payload)
+	}
+	if payload["runtime"] != "hermes" {
+		t.Errorf("payload.runtime = %v, want hermes", payload["runtime"])
+	}
+}
+
 // TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
 // 0 rows because the workspace flipped to online (or got restarted) between
 // the SELECT and the UPDATE. We should skip the event, not emit a false
@@ -76,8 +142,8 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
 func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
 	mock := setupTestDB(t)

-	mock.ExpectQuery(`SELECT id FROM workspaces`).
-		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-raced"))
+	mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+		WillReturnRows(candidateRows([3]any{"ws-raced", "claude-code", 700}))

 	mock.ExpectExec(`UPDATE workspaces`).
 		WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg()).
@@ -99,8 +165,8 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
 func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
 	mock := setupTestDB(t)

-	mock.ExpectQuery(`SELECT id FROM workspaces`).
-		WillReturnRows(sqlmock.NewRows([]string{"id"}))
+	mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+		WillReturnRows(candidateRows())

 	emit := &fakeEmitter{}
 	sweepStuckProvisioning(context.Background(), emit)
@@ -115,14 +181,16 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {

 // TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
 // both agents (claude-code + hermes) are stuck — both should get flipped
-// and both should get events.
+// and both should get events. claude-code at 11 min (over its 10-min
+// limit), hermes at 31 min (over its 30-min limit).
 func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
 	mock := setupTestDB(t)

-	mock.ExpectQuery(`SELECT id FROM workspaces`).
-		WillReturnRows(sqlmock.NewRows([]string{"id"}).
-			AddRow("ws-claude-code").
-			AddRow("ws-hermes"))
+	mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+		WillReturnRows(candidateRows(
+			[3]any{"ws-claude-code", "claude-code", 700},
+			[3]any{"ws-hermes", "hermes", 1860},
+		))

 	mock.ExpectExec(`UPDATE workspaces`).
 		WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg()).
@@ -145,8 +213,8 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
 func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
 	mock := setupTestDB(t)

-	mock.ExpectQuery(`SELECT id FROM workspaces`).
-		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+	mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+		WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))
 	mock.ExpectExec(`UPDATE workspaces`).
 		WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
 		WillReturnResult(sqlmock.NewResult(0, 1))
@@ -158,18 +226,47 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {

 // TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
 // env var takes effect when set to a positive integer, and falls back to
-// default otherwise.
+// the per-runtime default otherwise.
 func TestProvisioningTimeout_EnvOverride(t *testing.T) {
 	t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
-	if got := provisioningTimeout(); got.Seconds() != 60 {
-		t.Errorf("override: got %v, want 60s", got)
+	// When env override is set it wins over runtime defaults.
+	if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
+		t.Errorf("override (no runtime): got %v, want 60s", got)
+	}
+	if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
+		t.Errorf("override (hermes): got %v, want 60s", got)
 	}
 	t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
-	if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
-		t.Errorf("default: got %v, want %v", got, DefaultProvisioningTimeout)
+	if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
+		t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
 	}
 	t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
-	if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
-		t.Errorf("bad override: got %v, want default %v", got, DefaultProvisioningTimeout)
+	if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
+		t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
+	}
+}
+
+// TestProvisioningTimeout_RuntimeAware verifies hermes gets the longer
+// HermesProvisioningTimeout while other runtimes keep the default.
+// Mirrors bootstrap_watcher.go's bootstrapTimeoutFn — these two
+// timeouts must stay in sync (sweep > watcher) or healthy hermes
+// boots get killed mid-install.
+func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
+	cases := []struct {
+		runtime string
+		want    time.Duration
+	}{
+		{"hermes", HermesProvisioningTimeout},
+		{"langgraph", DefaultProvisioningTimeout},
+		{"claude-code", DefaultProvisioningTimeout},
+		{"crewai", DefaultProvisioningTimeout},
+		{"autogen", DefaultProvisioningTimeout},
+		{"", DefaultProvisioningTimeout},
+		{"unknown-runtime", DefaultProvisioningTimeout},
+	}
+	for _, c := range cases {
+		if got := provisioningTimeoutFor(c.runtime); got != c.want {
+			t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
+		}
 	}
 }