2026-06-05 03:36:59 +00:00
2 changed files with 355 additions and 43 deletions
@@ -36,11 +36,22 @@ package registry
 //     runtime <> 'external'. Paused/hibernated/removed/provisioning/
 //     awaiting_agent rows are out of scope; external rows are covered by
 //     the remote-heartbeat pass.
-//   - Per-cycle row cap + per-workspace timeout so one slow CP call can't
-//     stall the sweep.
+//   - Per-cycle row cap + per-cycle deadline + per-workspace timeout so
+//     one slow CP call (or a degraded-but-not-erroring CP) can't stall
+//     the sweep.
+//   - TOCTOU re-confirm before any flip: IsRunning resolves instance_id
+//     independently, so a row whose instance_id was cleared/NULLed (by a
+//     concurrent delete, the CP-orphan-sweeper, or a reprovision) between
+//     the reconciler's SELECT and the IsRunning probe yields a STALE
+//     (false, nil) that does NOT prove the EC2 is dead. We re-read the
+//     row's current (status, instance_id) and flip ONLY when the SAME
+//     non-empty instance we asked CP about is still the workspace's
+//     recorded instance AND it's still online/degraded. Mirrors the
+//     guarded-write re-confirm in healthsweep.

 import (
 	"context"
+	"database/sql"
 	"log"
 	"time"

@@ -70,6 +81,20 @@ const CPInstanceReconcileLimit = 200
 // timeout context derived from the cycle context.
 const cpInstanceCheckTimeout = 10 * time.Second

+// cpInstanceCycleDeadline bounds the wall-time of one whole reconcile
+// pass. With per-workspace 10s timeouts and a 200-row cap, a degraded-
+// but-not-erroring CP (each IsRunning slow but under the per-workspace
+// cap) could otherwise drag one cycle out for tens of minutes and starve
+// the next tick. Mirrors cp_orphan_sweeper's orphanSweepDeadline; chosen
+// under the 60s interval so a stuck cycle is abandoned before the next
+// one is due and the backlog drains across subsequent cycles.
+const cpInstanceCycleDeadline = 45 * time.Second
+
+// cpInstanceReconfirmTimeout bounds the TOCTOU re-confirm read. This is a
+// single indexed primary-key lookup, so it should never be slow; a tight
+// timeout keeps the re-confirm from itself becoming a stall point.
+const cpInstanceReconfirmTimeout = 5 * time.Second
+
 // StartCPInstanceReconciler runs the authoritative EC2-state reconcile
 // loop until ctx is cancelled. A nil checker makes the loop a no-op
 // (matches the nil-tolerant pattern of the sibling CP sweeper).
@@ -106,21 +131,41 @@ func StartCPInstanceReconciler(ctx context.Context, checker InstanceRunningCheck
 	}
 }

+// reconcileRow pairs a workspace id with the instance_id captured in the
+// SAME SELECT, so the TOCTOU re-confirm can verify CP's (false, nil)
+// answer is about the instance the row still records — not one cleared
+// out from under us between the SELECT and the IsRunning probe.
+type reconcileRow struct {
+	id         string
+	instanceID string
+}
+
 // reconcileOnce executes one reconcile pass. Defensive against db.DB
 // being nil so a misconfigured boot doesn't panic.
 //
-// Scope: online + SaaS-EC2 workspaces only. runtime='external' rows are
-// excluded (covered by the remote-heartbeat pass); paused/hibernated/
-// removed/provisioning/awaiting_agent are excluded by the status filter.
-func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
+// Scope: online/degraded + SaaS-EC2 workspaces only. runtime='external'
+// rows are excluded (covered by the remote-heartbeat pass); paused/
+// hibernated/removed/provisioning/awaiting_agent are excluded by the
+// status filter. `degraded` is included because a SaaS workspace whose
+// heartbeat handler flipped it degraded then lost its EC2 falls through
+// every other sweep (matches healthsweep's `status IN ('online',
+// 'degraded')`).
+func reconcileOnce(parent context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
 	if db.DB == nil {
 		return
 	}

-	rows, err := db.DB.QueryContext(ctx, `
-		SELECT id::text
+	// Per-cycle deadline so a degraded-but-not-erroring CP (each IsRunning
+	// slow but under the per-workspace cap) can't drag one cycle out for
+	// tens of minutes and starve the next tick. Per-workspace IsRunning
+	// timeouts derive from this cycle context.
+	cycleCtx, cancelCycle := context.WithTimeout(parent, cpInstanceCycleDeadline)
+	defer cancelCycle()
+
+	rows, err := db.DB.QueryContext(cycleCtx, `
+		SELECT id::text, instance_id
 		  FROM workspaces
-		 WHERE status = 'online'
+		 WHERE status IN ('online', 'degraded')
 		   AND instance_id IS NOT NULL
 		   AND instance_id != ''
 		   AND COALESCE(runtime, '') <> 'external'
@@ -133,46 +178,130 @@ func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOfflin
 	}
 	defer rows.Close()

-	var ids []string
+	var candidates []reconcileRow
 	for rows.Next() {
-		var id string
-		if scanErr := rows.Scan(&id); scanErr != nil {
+		var r reconcileRow
+		if scanErr := rows.Scan(&r.id, &r.instanceID); scanErr != nil {
 			log.Printf("cp-instance-reconciler: row scan failed: %v", scanErr)
 			continue
 		}
-		ids = append(ids, id)
+		candidates = append(candidates, r)
 	}
 	if iterErr := rows.Err(); iterErr != nil {
 		log.Printf("cp-instance-reconciler: rows iteration failed: %v", iterErr)
 		return
 	}

-	for _, id := range ids {
+	processed, skipped := 0, 0
+	for _, c := range candidates {
+		// Abandon the cycle if we've blown the per-cycle deadline; the
+		// next tick re-reads from the top (ORDER BY updated_at DESC) and
+		// drains the backlog. Without this a slow CP could keep one cycle
+		// running past its interval and never let a fresh one start.
+		if cycleCtx.Err() != nil {
+			log.Printf("cp-instance-reconciler: cycle deadline reached — processed %d, %d skipped (TOCTOU/changed), remaining deferred to next cycle", processed, skipped)
+			return
+		}
+		processed++
+
 		// Per-workspace timeout so one slow CP round-trip can't stall
-		// the whole sweep.
-		checkCtx, cancel := context.WithTimeout(ctx, cpInstanceCheckTimeout)
-		running, checkErr := checker.IsRunning(checkCtx, id)
+		// the whole sweep. Derived from cycleCtx so the cycle deadline
+		// always dominates.
+		checkCtx, cancel := context.WithTimeout(cycleCtx, cpInstanceCheckTimeout)
+		running, checkErr := checker.IsRunning(checkCtx, c.id)
 		cancel()

 		if checkErr != nil {
 			// FAIL-SAFE: transient DB/transport error (or a no-backend
 			// signal). IsRunning returns (true, err) on these, so never
 			// flip — leave the row online and retry next cycle.
-			log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", id, checkErr)
+			log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", c.id, checkErr)
 			continue
 		}
 		if running {
 			continue
 		}

-		// CLEAN "not running" — CP authoritatively reports the EC2 is
-		// terminated/stopped/absent. Feed it into the existing offline +
+		// (false, nil) is NOT yet proof the EC2 is dead. IsRunning
+		// resolves instance_id independently (resolveInstanceID); if the
+		// row's instance_id was cleared/NULLed (concurrent delete, the
+		// CP-orphan-sweeper NULLing it, a reprovision) or the row moved
+		// off online/degraded between our SELECT and this probe,
+		// IsRunning returns a STALE (false, nil) that reflects a missing
+		// instance_id, NOT a confirmed-terminated EC2. Re-confirm against
+		// the row's CURRENT state and flip ONLY when the SAME non-empty
+		// instance we asked CP about is still recorded AND the row is
+		// still online/degraded. Mirrors healthsweep's guarded write.
+		if !reconfirmStillOfflineCandidate(cycleCtx, c) {
+			skipped++
+			continue
+		}
+
+		// CONFIRMED "not running" — CP authoritatively reports the EC2 is
+		// terminated/stopped/absent AND the row still records that exact
+		// instance as online/degraded. Feed it into the existing offline +
 		// auto-heal machinery: onOffline flips the row offline and
 		// triggers RestartByID, which reprovisions with the existing
 		// volume.
-		log.Printf("cp-instance-reconciler: workspace %s is status=online but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", id)
+		log.Printf("cp-instance-reconciler: workspace %s (instance %s) is online/degraded but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", c.id, c.instanceID)
 		if onOffline != nil {
-			onOffline(ctx, id)
+			onOffline(cycleCtx, c.id)
 		}
 	}
 }
+
+// reconfirmStillOfflineCandidate re-reads the workspace's CURRENT
+// (status, instance_id) and reports whether it is STILL a valid offline
+// candidate for the instance we just probed. It returns true ONLY when:
+//
+//   - the row still exists, AND
+//   - current status IN ('online','degraded'), AND
+//   - current instance_id is non-empty, AND
+//   - current instance_id == the instance_id captured in the original
+//     SELECT (the one whose liveness CP just answered about).
+//
+// Any other outcome (row gone, status moved off online/degraded,
+// instance_id cleared or now points at a different instance) means the
+// IsRunning (false, nil) was a stale/cleared-instance snapshot rather
+// than a confirmed-terminated EC2 — return false so the caller skips the
+// flip. A DB error during re-confirm is treated as "not confirmed"
+// (false): fail-safe toward NOT flipping a workspace we can't re-verify.
+func reconfirmStillOfflineCandidate(parent context.Context, c reconcileRow) bool {
+	if db.DB == nil {
+		return false
+	}
+	ctx, cancel := context.WithTimeout(parent, cpInstanceReconfirmTimeout)
+	defer cancel()
+
+	var curStatus, curInstanceID string
+	err := db.DB.QueryRowContext(ctx, `
+		SELECT status, COALESCE(instance_id, '')
+		  FROM workspaces
+		 WHERE id = $1
+	`, c.id).Scan(&curStatus, &curInstanceID)
+	if err != nil {
+		if err == sql.ErrNoRows {
+			// Row deleted between SELECT and re-confirm — definitely not a
+			// terminated-EC2 signal. Skip.
+			log.Printf("cp-instance-reconciler: re-confirm %s: row gone — skipping flip (stale snapshot, not a dead EC2)", c.id)
+			return false
+		}
+		// Transient DB error — fail-safe toward NOT flipping.
+		log.Printf("cp-instance-reconciler: re-confirm %s errored, skipping flip (fail-safe): %v", c.id, err)
+		return false
+	}
+
+	if curStatus != "online" && curStatus != "degraded" {
+		log.Printf("cp-instance-reconciler: re-confirm %s: status moved to %q since SELECT — skipping flip", c.id, curStatus)
+		return false
+	}
+	if curInstanceID == "" {
+		log.Printf("cp-instance-reconciler: re-confirm %s: instance_id cleared since SELECT — skipping flip (CP answered about a now-detached instance)", c.id)
+		return false
+	}
+	if curInstanceID != c.instanceID {
+		log.Printf("cp-instance-reconciler: re-confirm %s: instance_id changed %s -> %s since SELECT (reprovision) — skipping flip", c.id, c.instanceID, curInstanceID)
+		return false
+	}
+	return true
+}
@@ -63,16 +63,48 @@ func (r *recordingOffline) got() []string {
 }

 // expectReconcileQuery registers the reconciler's SELECT, pinning the
-// scope-critical predicates: status='online', instance_id present, and
-// runtime <> 'external'. A future widening that drops any of these (e.g.
-// sweeping paused rows, or external rows the heartbeat pass owns) fails
-// every test that uses this helper.
+// scope-critical predicates: status IN ('online','degraded'), instance_id
+// present (captured as a column for the TOCTOU re-confirm), and runtime
+// <> 'external'. A future widening that drops any of these (e.g. sweeping
+// paused rows, or external rows the heartbeat pass owns), or that drops
+// the instance_id column the re-confirm depends on, fails every test that
+// uses this helper.
 func expectReconcileQuery(mock sqlmock.Sqlmock, rows *sqlmock.Rows) {
-	mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces\s+WHERE status = 'online'\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
+	mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
 		WithArgs(CPInstanceReconcileLimit).
 		WillReturnRows(rows)
 }

+// reconcileRows builds the two-column (id, instance_id) result the
+// reconciler's SELECT now returns. Pass id/instance_id pairs.
+func reconcileRows(pairs ...[2]string) *sqlmock.Rows {
+	r := sqlmock.NewRows([]string{"id", "instance_id"})
+	for _, p := range pairs {
+		r.AddRow(p[0], p[1])
+	}
+	return r
+}
+
+// expectReconfirm registers the TOCTOU re-confirm primary-key lookup for
+// workspace id `wsID`, returning the row's CURRENT (status, instance_id).
+// This is what the reconciler re-reads after IsRunning returns (false,
+// nil), before it flips: it only flips when the SAME non-empty instance
+// is still recorded AND status is still online/degraded.
+func expectReconfirm(mock sqlmock.Sqlmock, wsID, curStatus, curInstanceID string) {
+	mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}).AddRow(curStatus, curInstanceID))
+}
+
+// expectReconfirmNoRows registers a re-confirm lookup that finds the row
+// gone (deleted between SELECT and re-confirm) — the reconciler must
+// treat this as "not a dead EC2" and skip the flip.
+func expectReconfirmNoRows(mock sqlmock.Sqlmock, wsID string) {
+	mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}))
+}
+
 // TestReconcileOnce_NotRunning_FlipsOffline — the core bug (core#2247):
 // an online SaaS workspace whose EC2 is terminated. CP reports a CLEAN
 // (false, nil); onOffline MUST be called with that id so the existing
@@ -82,7 +114,10 @@ func TestReconcileOnce_NotRunning_FlipsOffline(t *testing.T) {
 	checker := &fakeRunningChecker{running: map[string]bool{"ws-dead": false}}
 	off := &recordingOffline{}

-	expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-dead"))
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-dead", "i-dead"}))
+	// (false,nil) → re-confirm: row still online with the SAME instance →
+	// confirmed-dead → flip.
+	expectReconfirm(mock, "ws-dead", "online", "i-dead")

 	reconcileOnce(context.Background(), checker, off.handler())

@@ -102,7 +137,8 @@ func TestReconcileOnce_Running_DoesNotFlip(t *testing.T) {
 	checker := &fakeRunningChecker{running: map[string]bool{"ws-alive": true}}
 	off := &recordingOffline{}

-	expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-alive"))
+	// Running → no re-confirm, no flip.
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-alive", "i-alive"}))

 	reconcileOnce(context.Background(), checker, off.handler())

@@ -126,7 +162,9 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
 	}
 	off := &recordingOffline{}

-	expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-blip"))
+	// (true,err) short-circuits BEFORE the re-confirm — no re-confirm query
+	// is registered, so a stray re-confirm would fail ExpectationsWereMet.
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-blip", "i-blip"}))

 	reconcileOnce(context.Background(), checker, off.handler())

@@ -143,19 +181,20 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {

 // TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline — pins the
 // SELECT predicate. The regex in expectReconcileQuery requires
-// status='online' AND runtime <> 'external'; if a future edit widens the
-// scope to include paused/hibernated/removed rows or external rows (owned
-// by the heartbeat pass), this query no longer matches and sqlmock fails
-// the test. With the predicate intact, a DB that has only out-of-scope
-// rows returns empty → no IsRunning, no flip.
+// status IN ('online','degraded') AND runtime <> 'external'; if a future
+// edit widens the scope to include paused/hibernated/removed rows or
+// external rows (owned by the heartbeat pass), or narrows it back to drop
+// 'degraded', this query no longer matches and sqlmock fails the test.
+// With the predicate intact, a DB that has only out-of-scope rows returns
+// empty → no IsRunning, no flip.
 func TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline(t *testing.T) {
 	mock := setupTestDB(t)
 	checker := &fakeRunningChecker{}
 	off := &recordingOffline{}

-	// The predicate filters out external + non-online rows server-side,
-	// modelled as the empty result those filters produce.
-	expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}))
+	// The predicate filters out external + out-of-scope-status rows
+	// server-side, modelled as the empty result those filters produce.
+	expectReconcileQuery(mock, reconcileRows())

 	reconcileOnce(context.Background(), checker, off.handler())

@@ -180,10 +219,13 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
 	}
 	off := &recordingOffline{}

-	expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).
-		AddRow("ws-dead").
-		AddRow("ws-alive").
-		AddRow("ws-blip"))
+	expectReconcileQuery(mock, reconcileRows(
+		[2]string{"ws-dead", "i-dead"},
+		[2]string{"ws-alive", "i-alive"},
+		[2]string{"ws-blip", "i-blip"},
+	))
+	// Only ws-dead reaches the re-confirm ((false,nil)); it confirms.
+	expectReconfirm(mock, "ws-dead", "online", "i-dead")

 	reconcileOnce(context.Background(), checker, off.handler())

@@ -195,6 +237,147 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
 	}
 }

+// TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip — the HIGH-1
+// regression guard. IsRunning returns a CLEAN (false, nil), but between
+// the reconciler's SELECT and the probe the row's instance_id changed
+// (reprovision attached a fresh EC2). IsRunning's independent
+// resolveInstanceID is the reason the (false,nil) is stale: it may have
+// resolved an empty/old instance. The re-confirm sees a DIFFERENT
+// instance_id and MUST skip — flipping here would knock out a workspace
+// whose NEW EC2 is not proven dead and fire RestartByID on a just-
+// reprovisioned row.
+func TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip(t *testing.T) {
+	mock := setupTestDB(t)
+	checker := &fakeRunningChecker{running: map[string]bool{"ws-race": false}}
+	off := &recordingOffline{}
+
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-race", "i-old"}))
+	// Re-confirm: row is still online but now points at a DIFFERENT
+	// instance (reprovisioned) → the (false,nil) was about i-old which is
+	// no longer attached → skip.
+	expectReconfirm(mock, "ws-race", "online", "i-new")
+
+	reconcileOnce(context.Background(), checker, off.handler())
+
+	if got := off.got(); len(got) != 0 {
+		t.Fatalf("TOCTOU guard violated: instance_id changed since SELECT must NOT flip, got %v", got)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+// TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip — same HIGH-1
+// guard, the instance_id-NULLed variant (CP-orphan-sweeper or a delete
+// cleared it). Re-confirm sees an empty instance_id → skip.
+func TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip(t *testing.T) {
+	mock := setupTestDB(t)
+	checker := &fakeRunningChecker{running: map[string]bool{"ws-cleared": false}}
+	off := &recordingOffline{}
+
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-cleared", "i-gone"}))
+	expectReconfirm(mock, "ws-cleared", "online", "") // instance_id cleared
+
+	reconcileOnce(context.Background(), checker, off.handler())
+
+	if got := off.got(); len(got) != 0 {
+		t.Fatalf("TOCTOU guard violated: cleared instance_id must NOT flip, got %v", got)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+// TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip — same HIGH-1 guard,
+// the status-moved variant. The row left online/degraded (e.g. paused or
+// removed) between SELECT and re-confirm → skip.
+func TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip(t *testing.T) {
+	mock := setupTestDB(t)
+	checker := &fakeRunningChecker{running: map[string]bool{"ws-paused": false}}
+	off := &recordingOffline{}
+
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-paused", "i-keep"}))
+	expectReconfirm(mock, "ws-paused", "paused", "i-keep") // status moved out of scope
+
+	reconcileOnce(context.Background(), checker, off.handler())
+
+	if got := off.got(); len(got) != 0 {
+		t.Fatalf("TOCTOU guard violated: row no longer online/degraded must NOT flip, got %v", got)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+// TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip — same HIGH-1 guard, the
+// row-deleted variant. The re-confirm finds no row (concurrent delete) →
+// skip; a stale (false,nil) about a just-deleted row must never fire
+// onOffline/RestartByID.
+func TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip(t *testing.T) {
+	mock := setupTestDB(t)
+	checker := &fakeRunningChecker{running: map[string]bool{"ws-deleted": false}}
+	off := &recordingOffline{}
+
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-deleted", "i-x"}))
+	expectReconfirmNoRows(mock, "ws-deleted") // row gone
+
+	reconcileOnce(context.Background(), checker, off.handler())
+
+	if got := off.got(); len(got) != 0 {
+		t.Fatalf("TOCTOU guard violated: deleted row must NOT flip, got %v", got)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+// TestReconcileOnce_Degraded_FlipsOffline — MED-3 scope. A `degraded`
+// SaaS workspace whose EC2 is gone is otherwise covered by NO sweep. It's
+// in scope (the SELECT regex requires status IN ('online','degraded')),
+// CP reports (false,nil), the re-confirm shows it STILL degraded with the
+// SAME instance → flip.
+func TestReconcileOnce_Degraded_FlipsOffline(t *testing.T) {
+	mock := setupTestDB(t)
+	checker := &fakeRunningChecker{running: map[string]bool{"ws-degraded": false}}
+	off := &recordingOffline{}
+
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-degraded", "i-deg"}))
+	expectReconfirm(mock, "ws-degraded", "degraded", "i-deg")
+
+	reconcileOnce(context.Background(), checker, off.handler())
+
+	if got := off.got(); len(got) != 1 || got[0] != "ws-degraded" {
+		t.Fatalf("expected onOffline(ws-degraded), got %v", got)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+// TestReconfirm_DBError_DoesNotFlip — re-confirm fail-safe. If the
+// re-confirm read itself errors (transient DB blip), we treat it as "not
+// confirmed" and skip the flip rather than acting on an unverifiable
+// (false,nil).
+func TestReconcileOnce_ReconfirmDBError_DoesNotFlip(t *testing.T) {
+	mock := setupTestDB(t)
+	checker := &fakeRunningChecker{running: map[string]bool{"ws-x": false}}
+	off := &recordingOffline{}
+
+	expectReconcileQuery(mock, reconcileRows([2]string{"ws-x", "i-x"}))
+	mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
+		WithArgs("ws-x").
+		WillReturnError(errors.New("connection reset"))
+
+	reconcileOnce(context.Background(), checker, off.handler())
+
+	if got := off.got(); len(got) != 0 {
+		t.Fatalf("re-confirm DB error must fail-safe (no flip), got %v", got)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
 // TestReconcileOnce_QueryError — DB transient failure. Reconcile returns
 // without panicking and never probes IsRunning or flips anything.
 func TestReconcileOnce_QueryError(t *testing.T) {
@@ -202,7 +385,7 @@ func TestReconcileOnce_QueryError(t *testing.T) {
 	checker := &fakeRunningChecker{}
 	off := &recordingOffline{}

-	mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
+	mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces`).
 		WithArgs(CPInstanceReconcileLimit).
 		WillReturnError(errors.New("connection refused"))