fix(provisioner): raise register-wait timeout 600s->720s (slow boot races timeout) #2564

Merged
agent-reviewer-cr2 merged 1 commit from fix/provision-timeout-720s into main 2026-06-11 05:02:11 +00:00
3 changed files with 26 additions and 11 deletions
@@ -30,7 +30,22 @@ type ProvisionTimeoutEmitter interface {
// image-pull + user-data execution on a cold EC2 worker while still
// getting well ahead of the "15+ minute" stuck state users see in
// production.
const DefaultProvisioningTimeout = 10 * time.Minute
// Bumped 10 -> 12 min: a fresh-provision cold boot was measured at
// ~7 min on a clean instance (i-052962296ad0c7240, 2026-06-10:
// launched 14:51:01Z, registered (/registry/register 200) 14:58:04Z;
// cloud-init blame = 381s in config-scripts_user, ~1m47s of it the ECR
// image pull). The slow tail of that boot distribution crosses the old
// 10-min sweep, which then false-fails a workspace that is still
// healthily booting and registers seconds later (the recurring MiniMax
// workspace-0c96b3ab failure). 12 min clears the observed ~7 min boot
// plus tail slack while staying ahead of the genuinely-stuck state.
// NOTE: the CP-side bootstrap-watcher
// (controlplane internal/provisioner/bootstrap_watcher.go
// bootstrapTimeoutFn) still ENDS its serial-console crash-diagnosis at
// 5 min for non-hermes runtimes. It does not flip-to-failed (this sweep
// owns that verdict), but bump it to 12 min in lockstep so early-crash
// reporting covers the full boot window.
const DefaultProvisioningTimeout = 12 * time.Minute
// HermesProvisioningTimeout matches the CP bootstrap-watcher's
// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
@@ -51,7 +51,7 @@ func TestSweep_RescueFiresOnBootFailureVerdict(t *testing.T) {
rec := withRescueHook(t)
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 700}))
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 800}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
WillReturnResult(sqlmock.NewResult(0, 1))
@@ -76,7 +76,7 @@ func TestSweep_RescueDoesNotFireOnRace(t *testing.T) {
rec := withRescueHook(t)
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 700}))
WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 800}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
WillReturnResult(sqlmock.NewResult(0, 0)) // raced — 0 rows
@@ -116,7 +116,7 @@ func TestSweep_RescueNilHookIsSafe(t *testing.T) {
t.Cleanup(func() { BootFailureRescueHook = prev })
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 700}))
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 800}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
WillReturnResult(sqlmock.NewResult(0, 1))
@@ -60,9 +60,9 @@ func candidateRows(rows ...[4]any) *sqlmock.Rows {
func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
mock := setupTestDB(t)
// claude-code workspace, 700s old > 600s default timeout → flipped.
// claude-code workspace, 800s old > 720s default timeout → flipped.
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700}))
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
@@ -147,7 +147,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
// unit test would catch it. This test fails on that refactor too.
//
// Scenario: a claude-code workspace 11 min old (660s). Default budget
// is 10 min (600s) → without manifest override, this would be flipped
// is 12 min (720s) → without manifest override, this would be flipped
// to failed. Manifest override declares 1200s → it should be SPARED.
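// No UPDATE, no event emitted.
// NOTE(review): 660s is now below the 720s default, so this row would be
// spared even without the manifest override. Raise the scenario age past
// 720s (e.g. 800s, as in the other sweep tests) so the override is what
// spares it — confirm against the fixture in the test body.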
func TestSweepStuckProvisioning_ManifestOverrideSparesRow(t *testing.T) {
@@ -225,7 +225,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 700}))
WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 800}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
@@ -263,14 +263,14 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
// TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
// both agents (claude-code + hermes) are stuck — both should get flipped
// and both should get events. claude-code at 11 min (over its 10-min
// and both should get events. claude-code at ~13 min (over its 12-min
// limit), hermes at 31 min (over its 30-min limit).
func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
WillReturnRows(candidateRows(
[4]any{"ws-claude-code", "claude-code", "i-cc", 700},
[4]any{"ws-claude-code", "claude-code", "i-cc", 800},
[4]any{"ws-hermes", "hermes", "i-hh", 1860},
))
@@ -296,7 +296,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700}))
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
WillReturnResult(sqlmock.NewResult(0, 1))