diff --git a/workspace-server/internal/registry/provisiontimeout.go b/workspace-server/internal/registry/provisiontimeout.go index 41c489da1..64185f89e 100644 --- a/workspace-server/internal/registry/provisiontimeout.go +++ b/workspace-server/internal/registry/provisiontimeout.go @@ -30,7 +30,22 @@ type ProvisionTimeoutEmitter interface { // image-pull + user-data execution on a cold EC2 worker while still // getting well ahead of the "15+ minute" stuck state users see in // production. -const DefaultProvisioningTimeout = 10 * time.Minute +// Bumped 10 -> 12 min: a fresh-provision cold boot was measured at +// ~7 min on a clean instance (i-052962296ad0c7240, 2026-06-10: +// launched 14:51:01Z, registered (/registry/register 200) 14:58:04Z; +// cloud-init blame = 381s in config-scripts_user, ~1m47s of it the ECR +// image pull). The slow tail of that boot distribution crosses the old +// 10-min sweep, which then false-fails a workspace that is still +// healthily booting and registers seconds later (the recurring MiniMax +// workspace-0c96b3ab failure). 12 min clears the observed ~7 min boot +// plus tail slack while staying ahead of the genuinely-stuck state. +// NOTE: the CP-side bootstrap-watcher +// (controlplane internal/provisioner/bootstrap_watcher.go +// bootstrapTimeoutFn) still ENDS its serial-console crash-diagnosis at +// 5 min for non-hermes runtimes. It does not flip-to-failed (this sweep +// owns that verdict), but bump it to 12 min in lockstep so early-crash +// reporting covers the full boot window. 
+const DefaultProvisioningTimeout = 12 * time.Minute // HermesProvisioningTimeout matches the CP bootstrap-watcher's // runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher diff --git a/workspace-server/internal/registry/provisiontimeout_rescue_test.go b/workspace-server/internal/registry/provisiontimeout_rescue_test.go index 283d244b8..1359a30e3 100644 --- a/workspace-server/internal/registry/provisiontimeout_rescue_test.go +++ b/workspace-server/internal/registry/provisiontimeout_rescue_test.go @@ -51,7 +51,7 @@ func TestSweep_RescueFiresOnBootFailureVerdict(t *testing.T) { rec := withRescueHook(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -76,7 +76,7 @@ func TestSweep_RescueDoesNotFireOnRace(t *testing.T) { rec := withRescueHook(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 700})) + WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 0)) // raced — 0 rows @@ -116,7 +116,7 @@ func TestSweep_RescueNilHookIsSafe(t *testing.T) { t.Cleanup(func() { BootFailureRescueHook = prev }) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 800})) mock.ExpectExec(`UPDATE workspaces`). 
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 1)) diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go index f314d05fd..80f029db1 100644 --- a/workspace-server/internal/registry/provisiontimeout_test.go +++ b/workspace-server/internal/registry/provisiontimeout_test.go @@ -60,9 +60,9 @@ func candidateRows(rows ...[4]any) *sqlmock.Rows { func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) { mock := setupTestDB(t) - // claude-code workspace, 700s old > 600s default timeout → flipped. + // claude-code workspace, 800s old > 720s default timeout → flipped. mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). @@ -147,7 +147,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) { // unit test would catch it. This test fails on that refactor too. // // Scenario: a claude-code workspace 11 min old (660s). Default budget -// is 10 min (600s) → without manifest override, this would be flipped +// is 12 min (720s), so the row must be older than 720s (NOTE(review): the 660s stated above no longer is — raise the fixture age, e.g. to 800s) → without manifest override, this would be flipped // to failed. Manifest override declares 1200s → it should be SPARED. // No UPDATE, no event emitted. func TestSweepStuckProvisioning_ManifestOverrideSparesRow(t *testing.T) { @@ -225,7 +225,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). 
- WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 700})) + WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). @@ -263,14 +263,14 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) { // TestSweepStuckProvisioning_MultipleStuck covers the realistic case where // both agents (claude-code + hermes) are stuck — both should get flipped -// and both should get events. claude-code at 11 min (over its 10-min +// and both should get events. claude-code at ~13 min (over its 12-min // limit), hermes at 31 min (over its 30-min limit). func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). WillReturnRows(candidateRows( - [4]any{"ws-claude-code", "claude-code", "i-cc", 700}, + [4]any{"ws-claude-code", "claude-code", "i-cc", 800}, [4]any{"ws-hermes", "hermes", "i-hh", 1860}, )) @@ -296,7 +296,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 1))