From 850434994ba67cb284fca2bbe0b6b7e17634cd88 Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Thu, 11 Jun 2026 00:13:27 +0000 Subject: [PATCH] fix(provisioner): raise register-wait timeout 600s->720s (slow boot races sweep) The provision-timeout sweep flips a workspace status=provisioning -> failed once it has sat past DefaultProvisioningTimeout (was 10 min) with the message "provisioning timed out -- container started but never called /registry/register". On a slow fresh-provision cold boot this fires against a workspace that is still healthily booting and registers seconds later. Measured evidence (clean fresh instance, 2026-06-10): instance i-052962296ad0c7240 launched 14:51:01Z "Registered with platform: 200" 14:58:04Z (~7 min) cloud-init analyze blame = 381s in config-scripts_user (user-data): ~1m47s ECR image pull, ~4.5 min pre-pull apt/docker/dependency setup. The tail of that boot distribution (slow ECR pull + cold apt/docker bring-up on a busy worker) crosses the old 10-min sweep window, which then false-fails the workspace -- the recurring MiniMax (workspace 0c96b3ab) provisioning failures. Bumping the non-hermes default to 12 min comfortably clears the observed ~7 min boot plus tail slack while staying well ahead of the genuinely-stuck "15+ minute" state. Second timeout note (NOT changed here -- different repo): the CP-side bootstrap-watcher (molecule-controlplane internal/provisioner/ bootstrap_watcher.go bootstrapTimeoutFn) returns 5 min for non-hermes. It is a serial-console crash-DIAGNOSIS poller, not the flip-to-failed authority (this sweep owns that verdict), so it does not by itself kill a slow boot. But it should be bumped to 12 min in lockstep so early-crash reporting covers the full boot window. Companion CP PR to follow. Tests: stuck-age literals bumped 700s->800s to stay overdue against the new 720s budget; the 660s "still-booting, spared" cases are unchanged (660 < 720). Comments refreshed to the new values. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/registry/provisiontimeout.go | 17 ++++++++++++++++- .../registry/provisiontimeout_rescue_test.go | 6 +++--- .../internal/registry/provisiontimeout_test.go | 14 +++++++------- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/workspace-server/internal/registry/provisiontimeout.go b/workspace-server/internal/registry/provisiontimeout.go index 41c489da1..64185f89e 100644 --- a/workspace-server/internal/registry/provisiontimeout.go +++ b/workspace-server/internal/registry/provisiontimeout.go @@ -30,7 +30,22 @@ type ProvisionTimeoutEmitter interface { // image-pull + user-data execution on a cold EC2 worker while still // getting well ahead of the "15+ minute" stuck state users see in // production. -const DefaultProvisioningTimeout = 10 * time.Minute +// Bumped 10 -> 12 min: a fresh-provision cold boot was measured at +// ~7 min on a clean instance (i-052962296ad0c7240, 2026-06-10: +// launched 14:51:01Z, registered (/registry/register 200) 14:58:04Z; +// cloud-init blame = 381s in config-scripts_user, ~1m47s of it the ECR +// image pull. The slow tail of that boot distribution crosses the old +// 10-min sweep, which then false-fails a workspace that is still +// healthily booting and registers seconds later (the recurring MiniMax +// workspace-0c96b3ab failure). 12 min clears the observed ~7 min boot +// plus tail slack while staying ahead of the genuinely-stuck state. +// NOTE: the CP-side bootstrap-watcher +// (controlplane internal/provisioner/bootstrap_watcher.go +// bootstrapTimeoutFn) still ENDS its serial-console crash-diagnosis at +// 5 min for non-hermes runtimes. It does not flip-to-failed (this sweep +// owns that verdict), but bump it to 12 min in lockstep so early-crash +// reporting covers the full boot window. +const DefaultProvisioningTimeout = 12 * time.Minute // HermesProvisioningTimeout matches the CP bootstrap-watcher's // runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher diff --git a/workspace-server/internal/registry/provisiontimeout_rescue_test.go b/workspace-server/internal/registry/provisiontimeout_rescue_test.go index 283d244b8..1359a30e3 100644 --- a/workspace-server/internal/registry/provisiontimeout_rescue_test.go +++ b/workspace-server/internal/registry/provisiontimeout_rescue_test.go @@ -51,7 +51,7 @@ func TestSweep_RescueFiresOnBootFailureVerdict(t *testing.T) { rec := withRescueHook(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -76,7 +76,7 @@ func TestSweep_RescueDoesNotFireOnRace(t *testing.T) { rec := withRescueHook(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 700})) + WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 0)) // raced — 0 rows @@ -116,7 +116,7 @@ func TestSweep_RescueNilHookIsSafe(t *testing.T) { t.Cleanup(func() { BootFailureRescueHook = prev }) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 1)) diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go index f314d05fd..80f029db1 100644 --- a/workspace-server/internal/registry/provisiontimeout_test.go +++ b/workspace-server/internal/registry/provisiontimeout_test.go @@ -60,9 +60,9 @@ func candidateRows(rows ...[4]any) *sqlmock.Rows { func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) { mock := setupTestDB(t) - // claude-code workspace, 700s old > 600s default timeout → flipped. + // claude-code workspace, 800s old > 720s default timeout → flipped. mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). @@ -147,7 +147,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) { // unit test would catch it. This test fails on that refactor too. // // Scenario: a claude-code workspace 11 min old (660s). Default budget -// is 10 min (600s) → without manifest override, this would be flipped +// is 12 min (720s) → without manifest override, this would be flipped // to failed. Manifest override declares 1200s → it should be SPARED. // No UPDATE, no event emitted. func TestSweepStuckProvisioning_ManifestOverrideSparesRow(t *testing.T) { @@ -225,7 +225,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 700})) + WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). @@ -263,14 +263,14 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) { // TestSweepStuckProvisioning_MultipleStuck covers the realistic case where // both agents (claude-code + hermes) are stuck — both should get flipped -// and both should get events. claude-code at 11 min (over its 10-min +// and both should get events. claude-code at ~13 min (over its 12-min // limit), hermes at 31 min (over its 30-min limit). func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). WillReturnRows(candidateRows( - [4]any{"ws-claude-code", "claude-code", "i-cc", 700}, + [4]any{"ws-claude-code", "claude-code", "i-cc", 800}, [4]any{"ws-hermes", "hermes", "i-hh", 1860}, )) @@ -296,7 +296,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`). - WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700})) + WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800})) mock.ExpectExec(`UPDATE workspaces`). WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed). WillReturnResult(sqlmock.NewResult(0, 1)) -- 2.52.0