fix(provisioner): raise register-wait timeout 300s->720s (slow boot races timeout) #2564
@@ -30,7 +30,22 @@ type ProvisionTimeoutEmitter interface {
|
||||
// image-pull + user-data execution on a cold EC2 worker while still
|
||||
// getting well ahead of the "15+ minute" stuck state users see in
|
||||
// production.
|
||||
const DefaultProvisioningTimeout = 10 * time.Minute
|
||||
// Bumped 10 -> 12 min: a fresh-provision cold boot was measured at
|
||||
// ~7 min on a clean instance (i-052962296ad0c7240, 2026-06-10:
|
||||
// launched 14:51:01Z, registered (/registry/register 200) 14:58:04Z;
|
||||
// cloud-init blame = 381s in config-scripts_user, ~1m47s of it the ECR
|
||||
// image pull). The slow tail of that boot distribution crosses the old
|
||||
// 10-min sweep, which then false-fails a workspace that is still
|
||||
// healthily booting and registers seconds later (the recurring MiniMax
|
||||
// workspace-0c96b3ab failure). 12 min clears the observed ~7 min boot
|
||||
// plus tail slack while staying ahead of the genuinely-stuck state.
|
||||
// NOTE: the CP-side bootstrap-watcher
|
||||
// (controlplane internal/provisioner/bootstrap_watcher.go
|
||||
// bootstrapTimeoutFn) still ENDS its serial-console crash-diagnosis at
|
||||
// 5 min for non-hermes runtimes. It does not flip-to-failed (this sweep
|
||||
// owns that verdict), but it should be bumped to 12 min in lockstep so early-crash
|
||||
// reporting covers the full boot window.
|
||||
const DefaultProvisioningTimeout = 12 * time.Minute
|
||||
|
||||
// HermesProvisioningTimeout matches the CP bootstrap-watcher's
|
||||
// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
|
||||
|
||||
@@ -51,7 +51,7 @@ func TestSweep_RescueFiresOnBootFailureVerdict(t *testing.T) {
|
||||
rec := withRescueHook(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 700}))
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-0badf00d", 800}))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
@@ -76,7 +76,7 @@ func TestSweep_RescueDoesNotFireOnRace(t *testing.T) {
|
||||
rec := withRescueHook(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 700}))
|
||||
WillReturnRows(candidateRows([4]any{"ws-raced", "codex", "i-raced", 800}))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 0)) // raced — 0 rows
|
||||
@@ -116,7 +116,7 @@ func TestSweep_RescueNilHookIsSafe(t *testing.T) {
|
||||
t.Cleanup(func() { BootFailureRescueHook = prev })
|
||||
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 700}))
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "codex", "i-x", 800}))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
@@ -60,9 +60,9 @@ func candidateRows(rows ...[4]any) *sqlmock.Rows {
|
||||
func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
// claude-code workspace, 700s old > 600s default timeout → flipped.
|
||||
// claude-code workspace, 800s old > 720s default timeout → flipped.
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700}))
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800}))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
||||
@@ -147,7 +147,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
|
||||
// unit test would catch it. This test fails on that refactor too.
|
||||
//
|
||||
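// Scenario: a claude-code workspace 11 min old (660s — NOTE(review): this is
// now below the 720s default, so the age must be raised above 720s for the
// scenario to hold; confirm against the fixture). Default budget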
|
||||
// is 10 min (600s) → without manifest override, this would be flipped
|
||||
// is 12 min (720s) → without manifest override, this would be flipped
|
||||
// to failed. Manifest override declares 1200s → it should be SPARED.
|
||||
// No UPDATE, no event emitted.
|
||||
func TestSweepStuckProvisioning_ManifestOverrideSparesRow(t *testing.T) {
|
||||
@@ -225,7 +225,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 700}))
|
||||
WillReturnRows(candidateRows([4]any{"ws-raced", "claude-code", "i-raced", 800}))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
||||
@@ -263,14 +263,14 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
|
||||
|
||||
// TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
|
||||
// both agents (claude-code + hermes) are stuck — both should get flipped
|
||||
// and both should get events. claude-code at 11 min (over its 10-min
|
||||
// and both should get events. claude-code at ~13 min (over its 12-min
|
||||
// limit), hermes at 31 min (over its 30-min limit).
|
||||
func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows(
|
||||
[4]any{"ws-claude-code", "claude-code", "i-cc", 700},
|
||||
[4]any{"ws-claude-code", "claude-code", "i-cc", 800},
|
||||
[4]any{"ws-hermes", "hermes", "i-hh", 1860},
|
||||
))
|
||||
|
||||
@@ -296,7 +296,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), COALESCE\(instance_id, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 700}))
|
||||
WillReturnRows(candidateRows([4]any{"ws-stuck", "claude-code", "i-stuck", 800}))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
Reference in New Issue
Block a user