diff --git a/workspace-server/internal/registry/cp_instance_reconciler.go b/workspace-server/internal/registry/cp_instance_reconciler.go index e5801e570..d1cd02a4f 100644 --- a/workspace-server/internal/registry/cp_instance_reconciler.go +++ b/workspace-server/internal/registry/cp_instance_reconciler.go @@ -36,11 +36,22 @@ package registry // runtime <> 'external'. Paused/hibernated/removed/provisioning/ // awaiting_agent rows are out of scope; external rows are covered by // the remote-heartbeat pass. -// - Per-cycle row cap + per-workspace timeout so one slow CP call can't -// stall the sweep. +// - Per-cycle row cap + per-cycle deadline + per-workspace timeout so +// one slow CP call (or a degraded-but-not-erroring CP) can't stall +// the sweep. +// - TOCTOU re-confirm before any flip: IsRunning resolves instance_id +// independently, so a row whose instance_id was cleared/NULLed (by a +// concurrent delete, the CP-orphan-sweeper, or a reprovision) between +// the reconciler's SELECT and the IsRunning probe yields a STALE +// (false, nil) that does NOT prove the EC2 is dead. We re-read the +// row's current (status, instance_id) and flip ONLY when the SAME +// non-empty instance we asked CP about is still the workspace's +// recorded instance AND it's still online/degraded. Mirrors the +// guarded-write re-confirm in healthsweep. import ( "context" + "database/sql" "log" "time" @@ -70,6 +81,20 @@ const CPInstanceReconcileLimit = 200 // timeout context derived from the cycle context. const cpInstanceCheckTimeout = 10 * time.Second +// cpInstanceCycleDeadline bounds the wall-time of one whole reconcile +// pass. With per-workspace 10s timeouts and a 200-row cap, a degraded- +// but-not-erroring CP (each IsRunning slow but under the per-workspace +// cap) could otherwise drag one cycle out for tens of minutes and starve +// the next tick. Mirrors cp_orphan_sweeper's orphanSweepDeadline; chosen +// under the 60s interval so a stuck cycle is abandoned before the next +// one is due and the backlog drains across subsequent cycles. +const cpInstanceCycleDeadline = 45 * time.Second + +// cpInstanceReconfirmTimeout bounds the TOCTOU re-confirm read. This is a +// single indexed primary-key lookup, so it should never be slow; a tight +// timeout keeps the re-confirm from itself becoming a stall point. +const cpInstanceReconfirmTimeout = 5 * time.Second + // StartCPInstanceReconciler runs the authoritative EC2-state reconcile // loop until ctx is cancelled. A nil checker makes the loop a no-op // (matches the nil-tolerant pattern of the sibling CP sweeper). @@ -106,21 +131,41 @@ func StartCPInstanceReconciler(ctx context.Context, checker InstanceRunningCheck } } +// reconcileRow pairs a workspace id with the instance_id captured in the +// SAME SELECT, so the TOCTOU re-confirm can verify CP's (false, nil) +// answer is about the instance the row still records — not one cleared +// out from under us between the SELECT and the IsRunning probe. +type reconcileRow struct { + id string + instanceID string +} + // reconcileOnce executes one reconcile pass. Defensive against db.DB // being nil so a misconfigured boot doesn't panic. // -// Scope: online + SaaS-EC2 workspaces only. runtime='external' rows are -// excluded (covered by the remote-heartbeat pass); paused/hibernated/ -// removed/provisioning/awaiting_agent are excluded by the status filter. -func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) { +// Scope: online/degraded + SaaS-EC2 workspaces only. runtime='external' +// rows are excluded (covered by the remote-heartbeat pass); paused/ +// hibernated/removed/provisioning/awaiting_agent are excluded by the +// status filter. `degraded` is included because a SaaS workspace whose +// heartbeat handler flipped it degraded then lost its EC2 falls through +// every other sweep (matches healthsweep's `status IN ('online', +// 'degraded')`). +func reconcileOnce(parent context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) { if db.DB == nil { return } - rows, err := db.DB.QueryContext(ctx, ` - SELECT id::text + // Per-cycle deadline so a degraded-but-not-erroring CP (each IsRunning + // slow but under the per-workspace cap) can't drag one cycle out for + // tens of minutes and starve the next tick. Per-workspace IsRunning + // timeouts derive from this cycle context. + cycleCtx, cancelCycle := context.WithTimeout(parent, cpInstanceCycleDeadline) + defer cancelCycle() + + rows, err := db.DB.QueryContext(cycleCtx, ` + SELECT id::text, instance_id FROM workspaces - WHERE status = 'online' + WHERE status IN ('online', 'degraded') AND instance_id IS NOT NULL AND instance_id != '' AND COALESCE(runtime, '') <> 'external' @@ -133,46 +178,130 @@ func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOfflin } defer rows.Close() - var ids []string + var candidates []reconcileRow for rows.Next() { - var id string - if scanErr := rows.Scan(&id); scanErr != nil { + var r reconcileRow + if scanErr := rows.Scan(&r.id, &r.instanceID); scanErr != nil { log.Printf("cp-instance-reconciler: row scan failed: %v", scanErr) continue } - ids = append(ids, id) + candidates = append(candidates, r) } if iterErr := rows.Err(); iterErr != nil { log.Printf("cp-instance-reconciler: rows iteration failed: %v", iterErr) return } - for _, id := range ids { + processed, skipped := 0, 0 + for _, c := range candidates { + // Abandon the cycle if we've blown the per-cycle deadline; the + // next tick re-reads from the top (ORDER BY updated_at DESC) and + // drains the backlog. Without this a slow CP could keep one cycle + // running past its interval and never let a fresh one start. + if cycleCtx.Err() != nil { + log.Printf("cp-instance-reconciler: cycle deadline reached — processed %d, %d skipped (TOCTOU/changed), remaining deferred to next cycle", processed, skipped) + return + } + processed++ + // Per-workspace timeout so one slow CP round-trip can't stall - // the whole sweep. - checkCtx, cancel := context.WithTimeout(ctx, cpInstanceCheckTimeout) - running, checkErr := checker.IsRunning(checkCtx, id) + // the whole sweep. Derived from cycleCtx so the cycle deadline + // always dominates. + checkCtx, cancel := context.WithTimeout(cycleCtx, cpInstanceCheckTimeout) + running, checkErr := checker.IsRunning(checkCtx, c.id) cancel() if checkErr != nil { // FAIL-SAFE: transient DB/transport error (or a no-backend // signal). IsRunning returns (true, err) on these, so never // flip — leave the row online and retry next cycle. - log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", id, checkErr) + log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", c.id, checkErr) continue } if running { continue } - // CLEAN "not running" — CP authoritatively reports the EC2 is - // terminated/stopped/absent. Feed it into the existing offline + + // (false, nil) is NOT yet proof the EC2 is dead. IsRunning + // resolves instance_id independently (resolveInstanceID); if the + // row's instance_id was cleared/NULLed (concurrent delete, the + // CP-orphan-sweeper NULLing it, a reprovision) or the row moved + // off online/degraded between our SELECT and this probe, + // IsRunning returns a STALE (false, nil) that reflects a missing + // instance_id, NOT a confirmed-terminated EC2. Re-confirm against + // the row's CURRENT state and flip ONLY when the SAME non-empty + // instance we asked CP about is still recorded AND the row is + // still online/degraded. Mirrors healthsweep's guarded write. + if !reconfirmStillOfflineCandidate(cycleCtx, c) { + skipped++ + continue + } + + // CONFIRMED "not running" — CP authoritatively reports the EC2 is + // terminated/stopped/absent AND the row still records that exact + // instance as online/degraded. Feed it into the existing offline + // auto-heal machinery: onOffline flips the row offline and // triggers RestartByID, which reprovisions with the existing // volume. - log.Printf("cp-instance-reconciler: workspace %s is status=online but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", id) + log.Printf("cp-instance-reconciler: workspace %s (instance %s) is online/degraded but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", c.id, c.instanceID) if onOffline != nil { - onOffline(ctx, id) + onOffline(cycleCtx, c.id) } } } + +// reconfirmStillOfflineCandidate re-reads the workspace's CURRENT +// (status, instance_id) and reports whether it is STILL a valid offline +// candidate for the instance we just probed. It returns true ONLY when: +// +// - the row still exists, AND +// - current status IN ('online','degraded'), AND +// - current instance_id is non-empty, AND +// - current instance_id == the instance_id captured in the original +// SELECT (the one whose liveness CP just answered about). +// +// Any other outcome (row gone, status moved off online/degraded, +// instance_id cleared or now points at a different instance) means the +// IsRunning (false, nil) was a stale/cleared-instance snapshot rather +// than a confirmed-terminated EC2 — return false so the caller skips the +// flip. A DB error during re-confirm is treated as "not confirmed" +// (false): fail-safe toward NOT flipping a workspace we can't re-verify. +func reconfirmStillOfflineCandidate(parent context.Context, c reconcileRow) bool { + if db.DB == nil { + return false + } + ctx, cancel := context.WithTimeout(parent, cpInstanceReconfirmTimeout) + defer cancel() + + var curStatus, curInstanceID string + err := db.DB.QueryRowContext(ctx, ` + SELECT status, COALESCE(instance_id, '') + FROM workspaces + WHERE id = $1 + `, c.id).Scan(&curStatus, &curInstanceID) + if err != nil { + if err == sql.ErrNoRows { + // Row deleted between SELECT and re-confirm — definitely not a + // terminated-EC2 signal. Skip. + log.Printf("cp-instance-reconciler: re-confirm %s: row gone — skipping flip (stale snapshot, not a dead EC2)", c.id) + return false + } + // Transient DB error — fail-safe toward NOT flipping. + log.Printf("cp-instance-reconciler: re-confirm %s errored, skipping flip (fail-safe): %v", c.id, err) + return false + } + + if curStatus != "online" && curStatus != "degraded" { + log.Printf("cp-instance-reconciler: re-confirm %s: status moved to %q since SELECT — skipping flip", c.id, curStatus) + return false + } + if curInstanceID == "" { + log.Printf("cp-instance-reconciler: re-confirm %s: instance_id cleared since SELECT — skipping flip (CP answered about a now-detached instance)", c.id) + return false + } + if curInstanceID != c.instanceID { + log.Printf("cp-instance-reconciler: re-confirm %s: instance_id changed %s -> %s since SELECT (reprovision) — skipping flip", c.id, c.instanceID, curInstanceID) + return false + } + return true +} diff --git a/workspace-server/internal/registry/cp_instance_reconciler_test.go b/workspace-server/internal/registry/cp_instance_reconciler_test.go index e1ea66c91..d91be78a3 100644 --- a/workspace-server/internal/registry/cp_instance_reconciler_test.go +++ b/workspace-server/internal/registry/cp_instance_reconciler_test.go @@ -63,16 +63,48 @@ func (r *recordingOffline) got() []string { } // expectReconcileQuery registers the reconciler's SELECT, pinning the -// scope-critical predicates: status='online', instance_id present, and -// runtime <> 'external'. A future widening that drops any of these (e.g. -// sweeping paused rows, or external rows the heartbeat pass owns) fails -// every test that uses this helper. +// scope-critical predicates: status IN ('online','degraded'), instance_id +// present (captured as a column for the TOCTOU re-confirm), and runtime +// <> 'external'. A future widening that drops any of these (e.g. sweeping +// paused rows, or external rows the heartbeat pass owns), or that drops +// the instance_id column the re-confirm depends on, fails every test that +// uses this helper. func expectReconcileQuery(mock sqlmock.Sqlmock, rows *sqlmock.Rows) { - mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces\s+WHERE status = 'online'\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`). + mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`). WithArgs(CPInstanceReconcileLimit). WillReturnRows(rows) } +// reconcileRows builds the two-column (id, instance_id) result the +// reconciler's SELECT now returns. Pass id/instance_id pairs. +func reconcileRows(pairs ...[2]string) *sqlmock.Rows { + r := sqlmock.NewRows([]string{"id", "instance_id"}) + for _, p := range pairs { + r.AddRow(p[0], p[1]) + } + return r +} + +// expectReconfirm registers the TOCTOU re-confirm primary-key lookup for +// workspace id `wsID`, returning the row's CURRENT (status, instance_id). +// This is what the reconciler re-reads after IsRunning returns (false, +// nil), before it flips: it only flips when the SAME non-empty instance +// is still recorded AND status is still online/degraded. +func expectReconfirm(mock sqlmock.Sqlmock, wsID, curStatus, curInstanceID string) { + mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}).AddRow(curStatus, curInstanceID)) +} + +// expectReconfirmNoRows registers a re-confirm lookup that finds the row +// gone (deleted between SELECT and re-confirm) — the reconciler must +// treat this as "not a dead EC2" and skip the flip. +func expectReconfirmNoRows(mock sqlmock.Sqlmock, wsID string) { + mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"})) +} + // TestReconcileOnce_NotRunning_FlipsOffline — the core bug (core#2247): // an online SaaS workspace whose EC2 is terminated. CP reports a CLEAN // (false, nil); onOffline MUST be called with that id so the existing @@ -82,7 +114,10 @@ func TestReconcileOnce_NotRunning_FlipsOffline(t *testing.T) { checker := &fakeRunningChecker{running: map[string]bool{"ws-dead": false}} off := &recordingOffline{} - expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-dead")) + expectReconcileQuery(mock, reconcileRows([2]string{"ws-dead", "i-dead"})) + // (false,nil) → re-confirm: row still online with the SAME instance → + // confirmed-dead → flip. + expectReconfirm(mock, "ws-dead", "online", "i-dead") reconcileOnce(context.Background(), checker, off.handler()) @@ -102,7 +137,8 @@ func TestReconcileOnce_Running_DoesNotFlip(t *testing.T) { checker := &fakeRunningChecker{running: map[string]bool{"ws-alive": true}} off := &recordingOffline{} - expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-alive")) + // Running → no re-confirm, no flip. + expectReconcileQuery(mock, reconcileRows([2]string{"ws-alive", "i-alive"})) reconcileOnce(context.Background(), checker, off.handler()) @@ -126,7 +162,9 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) { } off := &recordingOffline{} - expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-blip")) + // (true,err) short-circuits BEFORE the re-confirm — no re-confirm query + // is registered, so a stray re-confirm would fail ExpectationsWereMet. + expectReconcileQuery(mock, reconcileRows([2]string{"ws-blip", "i-blip"})) reconcileOnce(context.Background(), checker, off.handler()) @@ -143,19 +181,20 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) { // TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline — pins the // SELECT predicate. The regex in expectReconcileQuery requires -// status='online' AND runtime <> 'external'; if a future edit widens the -// scope to include paused/hibernated/removed rows or external rows (owned -// by the heartbeat pass), this query no longer matches and sqlmock fails -// the test. With the predicate intact, a DB that has only out-of-scope -// rows returns empty → no IsRunning, no flip. +// status IN ('online','degraded') AND runtime <> 'external'; if a future +// edit widens the scope to include paused/hibernated/removed rows or +// external rows (owned by the heartbeat pass), or narrows it back to drop +// 'degraded', this query no longer matches and sqlmock fails the test. +// With the predicate intact, a DB that has only out-of-scope rows returns +// empty → no IsRunning, no flip. func TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline(t *testing.T) { mock := setupTestDB(t) checker := &fakeRunningChecker{} off := &recordingOffline{} - // The predicate filters out external + non-online rows server-side, - // modelled as the empty result those filters produce. - expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"})) + // The predicate filters out external + out-of-scope-status rows + // server-side, modelled as the empty result those filters produce. + expectReconcileQuery(mock, reconcileRows()) reconcileOnce(context.Background(), checker, off.handler()) @@ -180,10 +219,13 @@ func TestReconcileOnce_MixedBatch(t *testing.T) { } off := &recordingOffline{} - expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}). - AddRow("ws-dead"). - AddRow("ws-alive"). - AddRow("ws-blip")) + expectReconcileQuery(mock, reconcileRows( + [2]string{"ws-dead", "i-dead"}, + [2]string{"ws-alive", "i-alive"}, + [2]string{"ws-blip", "i-blip"}, + )) + // Only ws-dead reaches the re-confirm ((false,nil)); it confirms. + expectReconfirm(mock, "ws-dead", "online", "i-dead") reconcileOnce(context.Background(), checker, off.handler()) @@ -195,6 +237,147 @@ func TestReconcileOnce_MixedBatch(t *testing.T) { } } +// TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip — the HIGH-1 +// regression guard. IsRunning returns a CLEAN (false, nil), but between +// the reconciler's SELECT and the probe the row's instance_id changed +// (reprovision attached a fresh EC2). IsRunning's independent +// resolveInstanceID is the reason the (false,nil) is stale: it may have +// resolved an empty/old instance. The re-confirm sees a DIFFERENT +// instance_id and MUST skip — flipping here would knock out a workspace +// whose NEW EC2 is not proven dead and fire RestartByID on a just- +// reprovisioned row. +func TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip(t *testing.T) { + mock := setupTestDB(t) + checker := &fakeRunningChecker{running: map[string]bool{"ws-race": false}} + off := &recordingOffline{} + + expectReconcileQuery(mock, reconcileRows([2]string{"ws-race", "i-old"})) + // Re-confirm: row is still online but now points at a DIFFERENT + // instance (reprovisioned) → the (false,nil) was about i-old which is + // no longer attached → skip. + expectReconfirm(mock, "ws-race", "online", "i-new") + + reconcileOnce(context.Background(), checker, off.handler()) + + if got := off.got(); len(got) != 0 { + t.Fatalf("TOCTOU guard violated: instance_id changed since SELECT must NOT flip, got %v", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet expectations: %v", err) + } +} + +// TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip — same HIGH-1 +// guard, the instance_id-NULLed variant (CP-orphan-sweeper or a delete +// cleared it). Re-confirm sees an empty instance_id → skip. +func TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip(t *testing.T) { + mock := setupTestDB(t) + checker := &fakeRunningChecker{running: map[string]bool{"ws-cleared": false}} + off := &recordingOffline{} + + expectReconcileQuery(mock, reconcileRows([2]string{"ws-cleared", "i-gone"})) + expectReconfirm(mock, "ws-cleared", "online", "") // instance_id cleared + + reconcileOnce(context.Background(), checker, off.handler()) + + if got := off.got(); len(got) != 0 { + t.Fatalf("TOCTOU guard violated: cleared instance_id must NOT flip, got %v", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet expectations: %v", err) + } +} + +// TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip — same HIGH-1 guard, +// the status-moved variant. The row left online/degraded (e.g. paused or +// removed) between SELECT and re-confirm → skip. +func TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip(t *testing.T) { + mock := setupTestDB(t) + checker := &fakeRunningChecker{running: map[string]bool{"ws-paused": false}} + off := &recordingOffline{} + + expectReconcileQuery(mock, reconcileRows([2]string{"ws-paused", "i-keep"})) + expectReconfirm(mock, "ws-paused", "paused", "i-keep") // status moved out of scope + + reconcileOnce(context.Background(), checker, off.handler()) + + if got := off.got(); len(got) != 0 { + t.Fatalf("TOCTOU guard violated: row no longer online/degraded must NOT flip, got %v", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet expectations: %v", err) + } +} + +// TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip — same HIGH-1 guard, the +// row-deleted variant. The re-confirm finds no row (concurrent delete) → +// skip; a stale (false,nil) about a just-deleted row must never fire +// onOffline/RestartByID. +func TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip(t *testing.T) { + mock := setupTestDB(t) + checker := &fakeRunningChecker{running: map[string]bool{"ws-deleted": false}} + off := &recordingOffline{} + + expectReconcileQuery(mock, reconcileRows([2]string{"ws-deleted", "i-x"})) + expectReconfirmNoRows(mock, "ws-deleted") // row gone + + reconcileOnce(context.Background(), checker, off.handler()) + + if got := off.got(); len(got) != 0 { + t.Fatalf("TOCTOU guard violated: deleted row must NOT flip, got %v", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet expectations: %v", err) + } +} + +// TestReconcileOnce_Degraded_FlipsOffline — MED-3 scope. A `degraded` +// SaaS workspace whose EC2 is gone is otherwise covered by NO sweep. It's +// in scope (the SELECT regex requires status IN ('online','degraded')), +// CP reports (false,nil), the re-confirm shows it STILL degraded with the +// SAME instance → flip. +func TestReconcileOnce_Degraded_FlipsOffline(t *testing.T) { + mock := setupTestDB(t) + checker := &fakeRunningChecker{running: map[string]bool{"ws-degraded": false}} + off := &recordingOffline{} + + expectReconcileQuery(mock, reconcileRows([2]string{"ws-degraded", "i-deg"})) + expectReconfirm(mock, "ws-degraded", "degraded", "i-deg") + + reconcileOnce(context.Background(), checker, off.handler()) + + if got := off.got(); len(got) != 1 || got[0] != "ws-degraded" { + t.Fatalf("expected onOffline(ws-degraded), got %v", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet expectations: %v", err) + } +} + +// TestReconfirm_DBError_DoesNotFlip — re-confirm fail-safe. If the +// re-confirm read itself errors (transient DB blip), we treat it as "not +// confirmed" and skip the flip rather than acting on an unverifiable +// (false,nil). +func TestReconcileOnce_ReconfirmDBError_DoesNotFlip(t *testing.T) { + mock := setupTestDB(t) + checker := &fakeRunningChecker{running: map[string]bool{"ws-x": false}} + off := &recordingOffline{} + + expectReconcileQuery(mock, reconcileRows([2]string{"ws-x", "i-x"})) + mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`). + WithArgs("ws-x"). + WillReturnError(errors.New("connection reset")) + + reconcileOnce(context.Background(), checker, off.handler()) + + if got := off.got(); len(got) != 0 { + t.Fatalf("re-confirm DB error must fail-safe (no flip), got %v", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("unmet expectations: %v", err) + } +} + // TestReconcileOnce_QueryError — DB transient failure. Reconcile returns // without panicking and never probes IsRunning or flips anything. func TestReconcileOnce_QueryError(t *testing.T) { @@ -202,7 +385,7 @@ func TestReconcileOnce_QueryError(t *testing.T) { checker := &fakeRunningChecker{} off := &recordingOffline{} - mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`). + mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces`). WithArgs(CPInstanceReconcileLimit). WillReturnError(errors.New("connection refused"))