fix(registry): reconciler — guard TOCTOU false-flip + cover degraded + cycle deadline (core#2261 review) #2283

Merged
hongming merged 1 commits from fix/core2261-reconciler-toctou-degraded-hardening into main 2026-06-05 03:36:59 +00:00
2 changed files with 355 additions and 43 deletions
@@ -36,11 +36,22 @@ package registry
// runtime <> 'external'. Paused/hibernated/removed/provisioning/
// awaiting_agent rows are out of scope; external rows are covered by
// the remote-heartbeat pass.
// - Per-cycle row cap + per-workspace timeout so one slow CP call can't
// stall the sweep.
// - Per-cycle row cap + per-cycle deadline + per-workspace timeout so
// one slow CP call (or a degraded-but-not-erroring CP) can't stall
// the sweep.
// - TOCTOU re-confirm before any flip: IsRunning resolves instance_id
// independently, so a row whose instance_id was cleared/NULLed (by a
// concurrent delete, the CP-orphan-sweeper, or a reprovision) between
// the reconciler's SELECT and the IsRunning probe yields a STALE
// (false, nil) that does NOT prove the EC2 is dead. We re-read the
// row's current (status, instance_id) and flip ONLY when the SAME
// non-empty instance we asked CP about is still the workspace's
// recorded instance AND it's still online/degraded. Mirrors the
// guarded-write re-confirm in healthsweep.
import (
"context"
"database/sql"
"log"
"time"
@@ -70,6 +81,20 @@ const CPInstanceReconcileLimit = 200
// timeout context derived from the cycle context.
const cpInstanceCheckTimeout = 10 * time.Second
// cpInstanceCycleDeadline bounds the wall-time of one whole reconcile
// pass. With per-workspace 10s timeouts and a 200-row cap, a degraded-
// but-not-erroring CP (each IsRunning slow but under the per-workspace
// cap) could otherwise drag one cycle out for tens of minutes and starve
// the next tick. Mirrors cp_orphan_sweeper's orphanSweepDeadline; chosen
// under the 60s interval so a stuck cycle is abandoned before the next
// one is due and the backlog drains across subsequent cycles.
const cpInstanceCycleDeadline = 45 * time.Second
// cpInstanceReconfirmTimeout bounds the TOCTOU re-confirm read. This is a
// single indexed primary-key lookup, so it should never be slow; a tight
// timeout keeps the re-confirm from itself becoming a stall point.
const cpInstanceReconfirmTimeout = 5 * time.Second
// StartCPInstanceReconciler runs the authoritative EC2-state reconcile
// loop until ctx is cancelled. A nil checker makes the loop a no-op
// (matches the nil-tolerant pattern of the sibling CP sweeper).
@@ -106,21 +131,41 @@ func StartCPInstanceReconciler(ctx context.Context, checker InstanceRunningCheck
}
}
// reconcileRow pairs a workspace id with the instance_id captured in the
// SAME SELECT, so the TOCTOU re-confirm can verify CP's (false, nil)
// answer is about the instance the row still records — not one cleared
// out from under us between the SELECT and the IsRunning probe.
type reconcileRow struct {
id string
instanceID string
}
// reconcileOnce executes one reconcile pass. Defensive against db.DB
// being nil so a misconfigured boot doesn't panic.
//
// Scope: online + SaaS-EC2 workspaces only. runtime='external' rows are
// excluded (covered by the remote-heartbeat pass); paused/hibernated/
// removed/provisioning/awaiting_agent are excluded by the status filter.
func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
// Scope: online/degraded + SaaS-EC2 workspaces only. runtime='external'
// rows are excluded (covered by the remote-heartbeat pass); paused/
// hibernated/removed/provisioning/awaiting_agent are excluded by the
// status filter. `degraded` is included because a SaaS workspace whose
// heartbeat handler flipped it degraded then lost its EC2 falls through
// every other sweep (matches healthsweep's `status IN ('online',
// 'degraded')`).
func reconcileOnce(parent context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
if db.DB == nil {
return
}
rows, err := db.DB.QueryContext(ctx, `
SELECT id::text
// Per-cycle deadline so a degraded-but-not-erroring CP (each IsRunning
// slow but under the per-workspace cap) can't drag one cycle out for
// tens of minutes and starve the next tick. Per-workspace IsRunning
// timeouts derive from this cycle context.
cycleCtx, cancelCycle := context.WithTimeout(parent, cpInstanceCycleDeadline)
defer cancelCycle()
rows, err := db.DB.QueryContext(cycleCtx, `
SELECT id::text, instance_id
FROM workspaces
WHERE status = 'online'
WHERE status IN ('online', 'degraded')
AND instance_id IS NOT NULL
AND instance_id != ''
AND COALESCE(runtime, '') <> 'external'
@@ -133,46 +178,130 @@ func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOfflin
}
defer rows.Close()
var ids []string
var candidates []reconcileRow
for rows.Next() {
var id string
if scanErr := rows.Scan(&id); scanErr != nil {
var r reconcileRow
if scanErr := rows.Scan(&r.id, &r.instanceID); scanErr != nil {
log.Printf("cp-instance-reconciler: row scan failed: %v", scanErr)
continue
}
ids = append(ids, id)
candidates = append(candidates, r)
}
if iterErr := rows.Err(); iterErr != nil {
log.Printf("cp-instance-reconciler: rows iteration failed: %v", iterErr)
return
}
for _, id := range ids {
processed, skipped := 0, 0
for _, c := range candidates {
// Abandon the cycle if we've blown the per-cycle deadline; the
// next tick re-reads from the top (ORDER BY updated_at DESC) and
// drains the backlog. Without this a slow CP could keep one cycle
// running past its interval and never let a fresh one start.
if cycleCtx.Err() != nil {
log.Printf("cp-instance-reconciler: cycle deadline reached — processed %d, %d skipped (TOCTOU/changed), remaining deferred to next cycle", processed, skipped)
return
}
processed++
// Per-workspace timeout so one slow CP round-trip can't stall
// the whole sweep.
checkCtx, cancel := context.WithTimeout(ctx, cpInstanceCheckTimeout)
running, checkErr := checker.IsRunning(checkCtx, id)
// the whole sweep. Derived from cycleCtx so the cycle deadline
// always dominates.
checkCtx, cancel := context.WithTimeout(cycleCtx, cpInstanceCheckTimeout)
running, checkErr := checker.IsRunning(checkCtx, c.id)
cancel()
if checkErr != nil {
// FAIL-SAFE: transient DB/transport error (or a no-backend
// signal). IsRunning returns (true, err) on these, so never
// flip — leave the row online and retry next cycle.
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", id, checkErr)
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", c.id, checkErr)
continue
}
if running {
continue
}
// CLEAN "not running" — CP authoritatively reports the EC2 is
// terminated/stopped/absent. Feed it into the existing offline +
// (false, nil) is NOT yet proof the EC2 is dead. IsRunning
// resolves instance_id independently (resolveInstanceID); if the
// row's instance_id was cleared/NULLed (concurrent delete, the
// CP-orphan-sweeper NULLing it, a reprovision) or the row moved
// off online/degraded between our SELECT and this probe,
// IsRunning returns a STALE (false, nil) that reflects a missing
// instance_id, NOT a confirmed-terminated EC2. Re-confirm against
// the row's CURRENT state and flip ONLY when the SAME non-empty
// instance we asked CP about is still recorded AND the row is
// still online/degraded. Mirrors healthsweep's guarded write.
if !reconfirmStillOfflineCandidate(cycleCtx, c) {
skipped++
continue
}
// CONFIRMED "not running" — CP authoritatively reports the EC2 is
// terminated/stopped/absent AND the row still records that exact
// instance as online/degraded. Feed it into the existing offline +
// auto-heal machinery: onOffline flips the row offline and
// triggers RestartByID, which reprovisions with the existing
// volume.
log.Printf("cp-instance-reconciler: workspace %s is status=online but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", id)
log.Printf("cp-instance-reconciler: workspace %s (instance %s) is online/degraded but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", c.id, c.instanceID)
if onOffline != nil {
onOffline(ctx, id)
onOffline(cycleCtx, c.id)
}
}
}
// reconfirmStillOfflineCandidate re-reads the workspace's CURRENT
// (status, instance_id) and reports whether it is STILL a valid offline
// candidate for the instance we just probed. It returns true ONLY when:
//
// - the row still exists, AND
// - current status IN ('online','degraded'), AND
// - current instance_id is non-empty, AND
// - current instance_id == the instance_id captured in the original
// SELECT (the one whose liveness CP just answered about).
//
// Any other outcome (row gone, status moved off online/degraded,
// instance_id cleared or now points at a different instance) means the
// IsRunning (false, nil) was a stale/cleared-instance snapshot rather
// than a confirmed-terminated EC2 — return false so the caller skips the
// flip. A DB error during re-confirm is treated as "not confirmed"
// (false): fail-safe toward NOT flipping a workspace we can't re-verify.
func reconfirmStillOfflineCandidate(parent context.Context, c reconcileRow) bool {
if db.DB == nil {
return false
}
ctx, cancel := context.WithTimeout(parent, cpInstanceReconfirmTimeout)
defer cancel()
var curStatus, curInstanceID string
err := db.DB.QueryRowContext(ctx, `
SELECT status, COALESCE(instance_id, '')
FROM workspaces
WHERE id = $1
`, c.id).Scan(&curStatus, &curInstanceID)
if err != nil {
if err == sql.ErrNoRows {
// Row deleted between SELECT and re-confirm — definitely not a
// terminated-EC2 signal. Skip.
log.Printf("cp-instance-reconciler: re-confirm %s: row gone — skipping flip (stale snapshot, not a dead EC2)", c.id)
return false
}
// Transient DB error — fail-safe toward NOT flipping.
log.Printf("cp-instance-reconciler: re-confirm %s errored, skipping flip (fail-safe): %v", c.id, err)
return false
}
if curStatus != "online" && curStatus != "degraded" {
log.Printf("cp-instance-reconciler: re-confirm %s: status moved to %q since SELECT — skipping flip", c.id, curStatus)
return false
}
if curInstanceID == "" {
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id cleared since SELECT — skipping flip (CP answered about a now-detached instance)", c.id)
return false
}
if curInstanceID != c.instanceID {
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id changed %s -> %s since SELECT (reprovision) — skipping flip", c.id, c.instanceID, curInstanceID)
return false
}
return true
}
@@ -63,16 +63,48 @@ func (r *recordingOffline) got() []string {
}
// expectReconcileQuery registers the reconciler's SELECT, pinning the
// scope-critical predicates: status='online', instance_id present, and
// runtime <> 'external'. A future widening that drops any of these (e.g.
// sweeping paused rows, or external rows the heartbeat pass owns) fails
// every test that uses this helper.
// scope-critical predicates: status IN ('online','degraded'), instance_id
// present (captured as a column for the TOCTOU re-confirm), and runtime
// <> 'external'. A future widening that drops any of these (e.g. sweeping
// paused rows, or external rows the heartbeat pass owns), or that drops
// the instance_id column the re-confirm depends on, fails every test that
// uses this helper.
func expectReconcileQuery(mock sqlmock.Sqlmock, rows *sqlmock.Rows) {
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces\s+WHERE status = 'online'\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
WithArgs(CPInstanceReconcileLimit).
WillReturnRows(rows)
}
// reconcileRows builds the two-column (id, instance_id) result the
// reconciler's SELECT now returns. Pass id/instance_id pairs.
func reconcileRows(pairs ...[2]string) *sqlmock.Rows {
r := sqlmock.NewRows([]string{"id", "instance_id"})
for _, p := range pairs {
r.AddRow(p[0], p[1])
}
return r
}
// expectReconfirm registers the TOCTOU re-confirm primary-key lookup for
// workspace id `wsID`, returning the row's CURRENT (status, instance_id).
// This is what the reconciler re-reads after IsRunning returns (false,
// nil), before it flips: it only flips when the SAME non-empty instance
// is still recorded AND status is still online/degraded.
func expectReconfirm(mock sqlmock.Sqlmock, wsID, curStatus, curInstanceID string) {
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}).AddRow(curStatus, curInstanceID))
}
// expectReconfirmNoRows registers a re-confirm lookup that finds the row
// gone (deleted between SELECT and re-confirm) — the reconciler must
// treat this as "not a dead EC2" and skip the flip.
func expectReconfirmNoRows(mock sqlmock.Sqlmock, wsID string) {
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}))
}
// TestReconcileOnce_NotRunning_FlipsOffline — the core bug (core#2247):
// an online SaaS workspace whose EC2 is terminated. CP reports a CLEAN
// (false, nil); onOffline MUST be called with that id so the existing
@@ -82,7 +114,10 @@ func TestReconcileOnce_NotRunning_FlipsOffline(t *testing.T) {
checker := &fakeRunningChecker{running: map[string]bool{"ws-dead": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-dead"))
expectReconcileQuery(mock, reconcileRows([2]string{"ws-dead", "i-dead"}))
// (false,nil) → re-confirm: row still online with the SAME instance →
// confirmed-dead → flip.
expectReconfirm(mock, "ws-dead", "online", "i-dead")
reconcileOnce(context.Background(), checker, off.handler())
@@ -102,7 +137,8 @@ func TestReconcileOnce_Running_DoesNotFlip(t *testing.T) {
checker := &fakeRunningChecker{running: map[string]bool{"ws-alive": true}}
off := &recordingOffline{}
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-alive"))
// Running → no re-confirm, no flip.
expectReconcileQuery(mock, reconcileRows([2]string{"ws-alive", "i-alive"}))
reconcileOnce(context.Background(), checker, off.handler())
@@ -126,7 +162,9 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
}
off := &recordingOffline{}
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-blip"))
// (true,err) short-circuits BEFORE the re-confirm — no re-confirm query
// is registered, so a stray re-confirm would fail ExpectationsWereMet.
expectReconcileQuery(mock, reconcileRows([2]string{"ws-blip", "i-blip"}))
reconcileOnce(context.Background(), checker, off.handler())
@@ -143,19 +181,20 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
// TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline — pins the
// SELECT predicate. The regex in expectReconcileQuery requires
// status='online' AND runtime <> 'external'; if a future edit widens the
// scope to include paused/hibernated/removed rows or external rows (owned
// by the heartbeat pass), this query no longer matches and sqlmock fails
// the test. With the predicate intact, a DB that has only out-of-scope
// rows returns empty → no IsRunning, no flip.
// status IN ('online','degraded') AND runtime <> 'external'; if a future
// edit widens the scope to include paused/hibernated/removed rows or
// external rows (owned by the heartbeat pass), or narrows it back to drop
// 'degraded', this query no longer matches and sqlmock fails the test.
// With the predicate intact, a DB that has only out-of-scope rows returns
// empty → no IsRunning, no flip.
func TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{}
off := &recordingOffline{}
// The predicate filters out external + non-online rows server-side,
// modelled as the empty result those filters produce.
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}))
// The predicate filters out external + out-of-scope-status rows
// server-side, modelled as the empty result those filters produce.
expectReconcileQuery(mock, reconcileRows())
reconcileOnce(context.Background(), checker, off.handler())
@@ -180,10 +219,13 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
}
off := &recordingOffline{}
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).
AddRow("ws-dead").
AddRow("ws-alive").
AddRow("ws-blip"))
expectReconcileQuery(mock, reconcileRows(
[2]string{"ws-dead", "i-dead"},
[2]string{"ws-alive", "i-alive"},
[2]string{"ws-blip", "i-blip"},
))
// Only ws-dead reaches the re-confirm ((false,nil)); it confirms.
expectReconfirm(mock, "ws-dead", "online", "i-dead")
reconcileOnce(context.Background(), checker, off.handler())
@@ -195,6 +237,147 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
}
}
// TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip — the HIGH-1
// regression guard. IsRunning returns a CLEAN (false, nil), but between
// the reconciler's SELECT and the probe the row's instance_id changed
// (reprovision attached a fresh EC2). IsRunning's independent
// resolveInstanceID is the reason the (false,nil) is stale: it may have
// resolved an empty/old instance. The re-confirm sees a DIFFERENT
// instance_id and MUST skip — flipping here would knock out a workspace
// whose NEW EC2 is not proven dead and fire RestartByID on a just-
// reprovisioned row.
func TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-race": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-race", "i-old"}))
// Re-confirm: row is still online but now points at a DIFFERENT
// instance (reprovisioned) → the (false,nil) was about i-old which is
// no longer attached → skip.
expectReconfirm(mock, "ws-race", "online", "i-new")
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: instance_id changed since SELECT must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip — same HIGH-1
// guard, the instance_id-NULLed variant (CP-orphan-sweeper or a delete
// cleared it). Re-confirm sees an empty instance_id → skip.
func TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-cleared": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-cleared", "i-gone"}))
expectReconfirm(mock, "ws-cleared", "online", "") // instance_id cleared
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: cleared instance_id must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip — same HIGH-1 guard,
// the status-moved variant. The row left online/degraded (e.g. paused or
// removed) between SELECT and re-confirm → skip.
func TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-paused": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-paused", "i-keep"}))
expectReconfirm(mock, "ws-paused", "paused", "i-keep") // status moved out of scope
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: row no longer online/degraded must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip — same HIGH-1 guard, the
// row-deleted variant. The re-confirm finds no row (concurrent delete) →
// skip; a stale (false,nil) about a just-deleted row must never fire
// onOffline/RestartByID.
func TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-deleted": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-deleted", "i-x"}))
expectReconfirmNoRows(mock, "ws-deleted") // row gone
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: deleted row must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_Degraded_FlipsOffline — MED-3 scope. A `degraded`
// SaaS workspace whose EC2 is gone is otherwise covered by NO sweep. It's
// in scope (the SELECT regex requires status IN ('online','degraded')),
// CP reports (false,nil), the re-confirm shows it STILL degraded with the
// SAME instance → flip.
func TestReconcileOnce_Degraded_FlipsOffline(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-degraded": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-degraded", "i-deg"}))
expectReconfirm(mock, "ws-degraded", "degraded", "i-deg")
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 1 || got[0] != "ws-degraded" {
t.Fatalf("expected onOffline(ws-degraded), got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconfirm_DBError_DoesNotFlip — re-confirm fail-safe. If the
// re-confirm read itself errors (transient DB blip), we treat it as "not
// confirmed" and skip the flip rather than acting on an unverifiable
// (false,nil).
func TestReconcileOnce_ReconfirmDBError_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-x": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-x", "i-x"}))
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
WithArgs("ws-x").
WillReturnError(errors.New("connection reset"))
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("re-confirm DB error must fail-safe (no flip), got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_QueryError — DB transient failure. Reconcile returns
// without panicking and never probes IsRunning or flips anything.
func TestReconcileOnce_QueryError(t *testing.T) {
@@ -202,7 +385,7 @@ func TestReconcileOnce_QueryError(t *testing.T) {
checker := &fakeRunningChecker{}
off := &recordingOffline{}
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces`).
WithArgs(CPInstanceReconcileLimit).
WillReturnError(errors.New("connection refused"))