fix(registry): reconciler — guard TOCTOU false-flip + cover degraded + cycle deadline (core#2261 review) #2283
@@ -36,11 +36,22 @@ package registry
|
||||
// runtime <> 'external'. Paused/hibernated/removed/provisioning/
|
||||
// awaiting_agent rows are out of scope; external rows are covered by
|
||||
// the remote-heartbeat pass.
|
||||
// - Per-cycle row cap + per-workspace timeout so one slow CP call can't
|
||||
// stall the sweep.
|
||||
// - Per-cycle row cap + per-cycle deadline + per-workspace timeout so
|
||||
// one slow CP call (or a degraded-but-not-erroring CP) can't stall
|
||||
// the sweep.
|
||||
// - TOCTOU re-confirm before any flip: IsRunning resolves instance_id
|
||||
// independently, so a row whose instance_id was cleared/NULLed (by a
|
||||
// concurrent delete, the CP-orphan-sweeper, or a reprovision) between
|
||||
// the reconciler's SELECT and the IsRunning probe yields a STALE
|
||||
// (false, nil) that does NOT prove the EC2 is dead. We re-read the
|
||||
// row's current (status, instance_id) and flip ONLY when the SAME
|
||||
// non-empty instance we asked CP about is still the workspace's
|
||||
// recorded instance AND it's still online/degraded. Mirrors the
|
||||
// guarded-write re-confirm in healthsweep.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
@@ -70,6 +81,20 @@ const CPInstanceReconcileLimit = 200
|
||||
// timeout context derived from the cycle context.
|
||||
const cpInstanceCheckTimeout = 10 * time.Second
|
||||
|
||||
// cpInstanceCycleDeadline bounds the wall-time of one whole reconcile
|
||||
// pass. With per-workspace 10s timeouts and a 200-row cap, a degraded-
|
||||
// but-not-erroring CP (each IsRunning slow but under the per-workspace
|
||||
// cap) could otherwise drag one cycle out for tens of minutes and starve
|
||||
// the next tick. Mirrors cp_orphan_sweeper's orphanSweepDeadline; chosen
|
||||
// under the 60s interval so a stuck cycle is abandoned before the next
|
||||
// one is due and the backlog drains across subsequent cycles.
|
||||
const cpInstanceCycleDeadline = 45 * time.Second
|
||||
|
||||
// cpInstanceReconfirmTimeout bounds the TOCTOU re-confirm read. This is a
|
||||
// single indexed primary-key lookup, so it should never be slow; a tight
|
||||
// timeout keeps the re-confirm from itself becoming a stall point.
|
||||
const cpInstanceReconfirmTimeout = 5 * time.Second
|
||||
|
||||
// StartCPInstanceReconciler runs the authoritative EC2-state reconcile
|
||||
// loop until ctx is cancelled. A nil checker makes the loop a no-op
|
||||
// (matches the nil-tolerant pattern of the sibling CP sweeper).
|
||||
@@ -106,21 +131,41 @@ func StartCPInstanceReconciler(ctx context.Context, checker InstanceRunningCheck
|
||||
}
|
||||
}
|
||||
|
||||
// reconcileRow pairs a workspace id with the instance_id captured in the
|
||||
// SAME SELECT, so the TOCTOU re-confirm can verify CP's (false, nil)
|
||||
// answer is about the instance the row still records — not one cleared
|
||||
// out from under us between the SELECT and the IsRunning probe.
|
||||
type reconcileRow struct {
|
||||
id string
|
||||
instanceID string
|
||||
}
|
||||
|
||||
// reconcileOnce executes one reconcile pass. Defensive against db.DB
|
||||
// being nil so a misconfigured boot doesn't panic.
|
||||
//
|
||||
// Scope: online + SaaS-EC2 workspaces only. runtime='external' rows are
|
||||
// excluded (covered by the remote-heartbeat pass); paused/hibernated/
|
||||
// removed/provisioning/awaiting_agent are excluded by the status filter.
|
||||
func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
|
||||
// Scope: online/degraded + SaaS-EC2 workspaces only. runtime='external'
|
||||
// rows are excluded (covered by the remote-heartbeat pass); paused/
|
||||
// hibernated/removed/provisioning/awaiting_agent are excluded by the
|
||||
// status filter. `degraded` is included because a SaaS workspace whose
|
||||
// heartbeat handler flipped it degraded then lost its EC2 falls through
|
||||
// every other sweep (matches healthsweep's `status IN ('online',
|
||||
// 'degraded')`).
|
||||
func reconcileOnce(parent context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
|
||||
if db.DB == nil {
|
||||
return
|
||||
}
|
||||
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id::text
|
||||
// Per-cycle deadline so a degraded-but-not-erroring CP (each IsRunning
|
||||
// slow but under the per-workspace cap) can't drag one cycle out for
|
||||
// tens of minutes and starve the next tick. Per-workspace IsRunning
|
||||
// timeouts derive from this cycle context.
|
||||
cycleCtx, cancelCycle := context.WithTimeout(parent, cpInstanceCycleDeadline)
|
||||
defer cancelCycle()
|
||||
|
||||
rows, err := db.DB.QueryContext(cycleCtx, `
|
||||
SELECT id::text, instance_id
|
||||
FROM workspaces
|
||||
WHERE status = 'online'
|
||||
WHERE status IN ('online', 'degraded')
|
||||
AND instance_id IS NOT NULL
|
||||
AND instance_id != ''
|
||||
AND COALESCE(runtime, '') <> 'external'
|
||||
@@ -133,46 +178,130 @@ func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOfflin
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var ids []string
|
||||
var candidates []reconcileRow
|
||||
for rows.Next() {
|
||||
var id string
|
||||
if scanErr := rows.Scan(&id); scanErr != nil {
|
||||
var r reconcileRow
|
||||
if scanErr := rows.Scan(&r.id, &r.instanceID); scanErr != nil {
|
||||
log.Printf("cp-instance-reconciler: row scan failed: %v", scanErr)
|
||||
continue
|
||||
}
|
||||
ids = append(ids, id)
|
||||
candidates = append(candidates, r)
|
||||
}
|
||||
if iterErr := rows.Err(); iterErr != nil {
|
||||
log.Printf("cp-instance-reconciler: rows iteration failed: %v", iterErr)
|
||||
return
|
||||
}
|
||||
|
||||
for _, id := range ids {
|
||||
processed, skipped := 0, 0
|
||||
for _, c := range candidates {
|
||||
// Abandon the cycle if we've blown the per-cycle deadline; the
|
||||
// next tick re-reads from the top (ORDER BY updated_at DESC) and
|
||||
// drains the backlog. Without this a slow CP could keep one cycle
|
||||
// running past its interval and never let a fresh one start.
|
||||
if cycleCtx.Err() != nil {
|
||||
log.Printf("cp-instance-reconciler: cycle deadline reached — processed %d, %d skipped (TOCTOU/changed), remaining deferred to next cycle", processed, skipped)
|
||||
return
|
||||
}
|
||||
processed++
|
||||
|
||||
// Per-workspace timeout so one slow CP round-trip can't stall
|
||||
// the whole sweep.
|
||||
checkCtx, cancel := context.WithTimeout(ctx, cpInstanceCheckTimeout)
|
||||
running, checkErr := checker.IsRunning(checkCtx, id)
|
||||
// the whole sweep. Derived from cycleCtx so the cycle deadline
|
||||
// always dominates.
|
||||
checkCtx, cancel := context.WithTimeout(cycleCtx, cpInstanceCheckTimeout)
|
||||
running, checkErr := checker.IsRunning(checkCtx, c.id)
|
||||
cancel()
|
||||
|
||||
if checkErr != nil {
|
||||
// FAIL-SAFE: transient DB/transport error (or a no-backend
|
||||
// signal). IsRunning returns (true, err) on these, so never
|
||||
// flip — leave the row online and retry next cycle.
|
||||
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", id, checkErr)
|
||||
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", c.id, checkErr)
|
||||
continue
|
||||
}
|
||||
if running {
|
||||
continue
|
||||
}
|
||||
|
||||
// CLEAN "not running" — CP authoritatively reports the EC2 is
|
||||
// terminated/stopped/absent. Feed it into the existing offline +
|
||||
// (false, nil) is NOT yet proof the EC2 is dead. IsRunning
|
||||
// resolves instance_id independently (resolveInstanceID); if the
|
||||
// row's instance_id was cleared/NULLed (concurrent delete, the
|
||||
// CP-orphan-sweeper NULLing it, a reprovision) or the row moved
|
||||
// off online/degraded between our SELECT and this probe,
|
||||
// IsRunning returns a STALE (false, nil) that reflects a missing
|
||||
// instance_id, NOT a confirmed-terminated EC2. Re-confirm against
|
||||
// the row's CURRENT state and flip ONLY when the SAME non-empty
|
||||
// instance we asked CP about is still recorded AND the row is
|
||||
// still online/degraded. Mirrors healthsweep's guarded write.
|
||||
if !reconfirmStillOfflineCandidate(cycleCtx, c) {
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
|
||||
// CONFIRMED "not running" — CP authoritatively reports the EC2 is
|
||||
// terminated/stopped/absent AND the row still records that exact
|
||||
// instance as online/degraded. Feed it into the existing offline +
|
||||
// auto-heal machinery: onOffline flips the row offline and
|
||||
// triggers RestartByID, which reprovisions with the existing
|
||||
// volume.
|
||||
log.Printf("cp-instance-reconciler: workspace %s is status=online but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", id)
|
||||
log.Printf("cp-instance-reconciler: workspace %s (instance %s) is online/degraded but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", c.id, c.instanceID)
|
||||
if onOffline != nil {
|
||||
onOffline(ctx, id)
|
||||
onOffline(cycleCtx, c.id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reconfirmStillOfflineCandidate re-reads the workspace's CURRENT
|
||||
// (status, instance_id) and reports whether it is STILL a valid offline
|
||||
// candidate for the instance we just probed. It returns true ONLY when:
|
||||
//
|
||||
// - the row still exists, AND
|
||||
// - current status IN ('online','degraded'), AND
|
||||
// - current instance_id is non-empty, AND
|
||||
// - current instance_id == the instance_id captured in the original
|
||||
// SELECT (the one whose liveness CP just answered about).
|
||||
//
|
||||
// Any other outcome (row gone, status moved off online/degraded,
|
||||
// instance_id cleared or now points at a different instance) means the
|
||||
// IsRunning (false, nil) was a stale/cleared-instance snapshot rather
|
||||
// than a confirmed-terminated EC2 — return false so the caller skips the
|
||||
// flip. A DB error during re-confirm is treated as "not confirmed"
|
||||
// (false): fail-safe toward NOT flipping a workspace we can't re-verify.
|
||||
func reconfirmStillOfflineCandidate(parent context.Context, c reconcileRow) bool {
|
||||
if db.DB == nil {
|
||||
return false
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(parent, cpInstanceReconfirmTimeout)
|
||||
defer cancel()
|
||||
|
||||
var curStatus, curInstanceID string
|
||||
err := db.DB.QueryRowContext(ctx, `
|
||||
SELECT status, COALESCE(instance_id, '')
|
||||
FROM workspaces
|
||||
WHERE id = $1
|
||||
`, c.id).Scan(&curStatus, &curInstanceID)
|
||||
if err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
// Row deleted between SELECT and re-confirm — definitely not a
|
||||
// terminated-EC2 signal. Skip.
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: row gone — skipping flip (stale snapshot, not a dead EC2)", c.id)
|
||||
return false
|
||||
}
|
||||
// Transient DB error — fail-safe toward NOT flipping.
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s errored, skipping flip (fail-safe): %v", c.id, err)
|
||||
return false
|
||||
}
|
||||
|
||||
if curStatus != "online" && curStatus != "degraded" {
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: status moved to %q since SELECT — skipping flip", c.id, curStatus)
|
||||
return false
|
||||
}
|
||||
if curInstanceID == "" {
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id cleared since SELECT — skipping flip (CP answered about a now-detached instance)", c.id)
|
||||
return false
|
||||
}
|
||||
if curInstanceID != c.instanceID {
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id changed %s -> %s since SELECT (reprovision) — skipping flip", c.id, c.instanceID, curInstanceID)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -63,16 +63,48 @@ func (r *recordingOffline) got() []string {
|
||||
}
|
||||
|
||||
// expectReconcileQuery registers the reconciler's SELECT, pinning the
|
||||
// scope-critical predicates: status='online', instance_id present, and
|
||||
// runtime <> 'external'. A future widening that drops any of these (e.g.
|
||||
// sweeping paused rows, or external rows the heartbeat pass owns) fails
|
||||
// every test that uses this helper.
|
||||
// scope-critical predicates: status IN ('online','degraded'), instance_id
|
||||
// present (captured as a column for the TOCTOU re-confirm), and runtime
|
||||
// <> 'external'. A future widening that drops any of these (e.g. sweeping
|
||||
// paused rows, or external rows the heartbeat pass owns), or that drops
|
||||
// the instance_id column the re-confirm depends on, fails every test that
|
||||
// uses this helper.
|
||||
func expectReconcileQuery(mock sqlmock.Sqlmock, rows *sqlmock.Rows) {
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces\s+WHERE status = 'online'\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
|
||||
WithArgs(CPInstanceReconcileLimit).
|
||||
WillReturnRows(rows)
|
||||
}
|
||||
|
||||
// reconcileRows builds the two-column (id, instance_id) result the
|
||||
// reconciler's SELECT now returns. Pass id/instance_id pairs.
|
||||
func reconcileRows(pairs ...[2]string) *sqlmock.Rows {
|
||||
r := sqlmock.NewRows([]string{"id", "instance_id"})
|
||||
for _, p := range pairs {
|
||||
r.AddRow(p[0], p[1])
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// expectReconfirm registers the TOCTOU re-confirm primary-key lookup for
|
||||
// workspace id `wsID`, returning the row's CURRENT (status, instance_id).
|
||||
// This is what the reconciler re-reads after IsRunning returns (false,
|
||||
// nil), before it flips: it only flips when the SAME non-empty instance
|
||||
// is still recorded AND status is still online/degraded.
|
||||
func expectReconfirm(mock sqlmock.Sqlmock, wsID, curStatus, curInstanceID string) {
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}).AddRow(curStatus, curInstanceID))
|
||||
}
|
||||
|
||||
// expectReconfirmNoRows registers a re-confirm lookup that finds the row
|
||||
// gone (deleted between SELECT and re-confirm) — the reconciler must
|
||||
// treat this as "not a dead EC2" and skip the flip.
|
||||
func expectReconfirmNoRows(mock sqlmock.Sqlmock, wsID string) {
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}))
|
||||
}
|
||||
|
||||
// TestReconcileOnce_NotRunning_FlipsOffline — the core bug (core#2247):
|
||||
// an online SaaS workspace whose EC2 is terminated. CP reports a CLEAN
|
||||
// (false, nil); onOffline MUST be called with that id so the existing
|
||||
@@ -82,7 +114,10 @@ func TestReconcileOnce_NotRunning_FlipsOffline(t *testing.T) {
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-dead": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-dead"))
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-dead", "i-dead"}))
|
||||
// (false,nil) → re-confirm: row still online with the SAME instance →
|
||||
// confirmed-dead → flip.
|
||||
expectReconfirm(mock, "ws-dead", "online", "i-dead")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -102,7 +137,8 @@ func TestReconcileOnce_Running_DoesNotFlip(t *testing.T) {
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-alive": true}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-alive"))
|
||||
// Running → no re-confirm, no flip.
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-alive", "i-alive"}))
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -126,7 +162,9 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
|
||||
}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-blip"))
|
||||
// (true,err) short-circuits BEFORE the re-confirm — no re-confirm query
|
||||
// is registered, so a stray re-confirm would fail ExpectationsWereMet.
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-blip", "i-blip"}))
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -143,19 +181,20 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
|
||||
|
||||
// TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline — pins the
|
||||
// SELECT predicate. The regex in expectReconcileQuery requires
|
||||
// status='online' AND runtime <> 'external'; if a future edit widens the
|
||||
// scope to include paused/hibernated/removed rows or external rows (owned
|
||||
// by the heartbeat pass), this query no longer matches and sqlmock fails
|
||||
// the test. With the predicate intact, a DB that has only out-of-scope
|
||||
// rows returns empty → no IsRunning, no flip.
|
||||
// status IN ('online','degraded') AND runtime <> 'external'; if a future
|
||||
// edit widens the scope to include paused/hibernated/removed rows or
|
||||
// external rows (owned by the heartbeat pass), or narrows it back to drop
|
||||
// 'degraded', this query no longer matches and sqlmock fails the test.
|
||||
// With the predicate intact, a DB that has only out-of-scope rows returns
|
||||
// empty → no IsRunning, no flip.
|
||||
func TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{}
|
||||
off := &recordingOffline{}
|
||||
|
||||
// The predicate filters out external + non-online rows server-side,
|
||||
// modelled as the empty result those filters produce.
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}))
|
||||
// The predicate filters out external + out-of-scope-status rows
|
||||
// server-side, modelled as the empty result those filters produce.
|
||||
expectReconcileQuery(mock, reconcileRows())
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -180,10 +219,13 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
|
||||
}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).
|
||||
AddRow("ws-dead").
|
||||
AddRow("ws-alive").
|
||||
AddRow("ws-blip"))
|
||||
expectReconcileQuery(mock, reconcileRows(
|
||||
[2]string{"ws-dead", "i-dead"},
|
||||
[2]string{"ws-alive", "i-alive"},
|
||||
[2]string{"ws-blip", "i-blip"},
|
||||
))
|
||||
// Only ws-dead reaches the re-confirm ((false,nil)); it confirms.
|
||||
expectReconfirm(mock, "ws-dead", "online", "i-dead")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -195,6 +237,147 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip — the HIGH-1
|
||||
// regression guard. IsRunning returns a CLEAN (false, nil), but between
|
||||
// the reconciler's SELECT and the probe the row's instance_id changed
|
||||
// (reprovision attached a fresh EC2). IsRunning's independent
|
||||
// resolveInstanceID is the reason the (false,nil) is stale: it may have
|
||||
// resolved an empty/old instance. The re-confirm sees a DIFFERENT
|
||||
// instance_id and MUST skip — flipping here would knock out a workspace
|
||||
// whose NEW EC2 is not proven dead and fire RestartByID on a just-
|
||||
// reprovisioned row.
|
||||
func TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-race": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-race", "i-old"}))
|
||||
// Re-confirm: row is still online but now points at a DIFFERENT
|
||||
// instance (reprovisioned) → the (false,nil) was about i-old which is
|
||||
// no longer attached → skip.
|
||||
expectReconfirm(mock, "ws-race", "online", "i-new")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: instance_id changed since SELECT must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip — same HIGH-1
|
||||
// guard, the instance_id-NULLed variant (CP-orphan-sweeper or a delete
|
||||
// cleared it). Re-confirm sees an empty instance_id → skip.
|
||||
func TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-cleared": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-cleared", "i-gone"}))
|
||||
expectReconfirm(mock, "ws-cleared", "online", "") // instance_id cleared
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: cleared instance_id must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip — same HIGH-1 guard,
|
||||
// the status-moved variant. The row left online/degraded (e.g. paused or
|
||||
// removed) between SELECT and re-confirm → skip.
|
||||
func TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-paused": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-paused", "i-keep"}))
|
||||
expectReconfirm(mock, "ws-paused", "paused", "i-keep") // status moved out of scope
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: row no longer online/degraded must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip — same HIGH-1 guard, the
|
||||
// row-deleted variant. The re-confirm finds no row (concurrent delete) →
|
||||
// skip; a stale (false,nil) about a just-deleted row must never fire
|
||||
// onOffline/RestartByID.
|
||||
func TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-deleted": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-deleted", "i-x"}))
|
||||
expectReconfirmNoRows(mock, "ws-deleted") // row gone
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: deleted row must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_Degraded_FlipsOffline — MED-3 scope. A `degraded`
|
||||
// SaaS workspace whose EC2 is gone is otherwise covered by NO sweep. It's
|
||||
// in scope (the SELECT regex requires status IN ('online','degraded')),
|
||||
// CP reports (false,nil), the re-confirm shows it STILL degraded with the
|
||||
// SAME instance → flip.
|
||||
func TestReconcileOnce_Degraded_FlipsOffline(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-degraded": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-degraded", "i-deg"}))
|
||||
expectReconfirm(mock, "ws-degraded", "degraded", "i-deg")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 1 || got[0] != "ws-degraded" {
|
||||
t.Fatalf("expected onOffline(ws-degraded), got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconfirm_DBError_DoesNotFlip — re-confirm fail-safe. If the
|
||||
// re-confirm read itself errors (transient DB blip), we treat it as "not
|
||||
// confirmed" and skip the flip rather than acting on an unverifiable
|
||||
// (false,nil).
|
||||
func TestReconcileOnce_ReconfirmDBError_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-x": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-x", "i-x"}))
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
|
||||
WithArgs("ws-x").
|
||||
WillReturnError(errors.New("connection reset"))
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("re-confirm DB error must fail-safe (no flip), got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_QueryError — DB transient failure. Reconcile returns
|
||||
// without panicking and never probes IsRunning or flips anything.
|
||||
func TestReconcileOnce_QueryError(t *testing.T) {
|
||||
@@ -202,7 +385,7 @@ func TestReconcileOnce_QueryError(t *testing.T) {
|
||||
checker := &fakeRunningChecker{}
|
||||
off := &recordingOffline{}
|
||||
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces`).
|
||||
WithArgs(CPInstanceReconcileLimit).
|
||||
WillReturnError(errors.New("connection refused"))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user