diff --git a/workspace-server/internal/db/workspace_status_enum_drift_test.go b/workspace-server/internal/db/workspace_status_enum_drift_test.go new file mode 100644 index 00000000..d0b6a8dc --- /dev/null +++ b/workspace-server/internal/db/workspace_status_enum_drift_test.go @@ -0,0 +1,227 @@ +package db_test + +// Static drift gate: every workspaces.status literal written in the Go +// tree must exist in the workspace_status enum defined by the migrations. +// +// Why this exists: the `workspace_status` enum (migrations 043 + 046) +// shipped without 'awaiting_agent' even though application code wrote +// that value, and every UPDATE silently failed in production for five +// days before the gap surfaced (see 046_workspace_status_awaiting_agent.up.sql). +// The unit tests passed because sqlmock matches SQL by regex, not against +// a live enum constraint. +// +// Approach: extract every Go string literal whose body matches +// (?i)workspaces[^a-z_].*status (so "UPDATE workspaces SET status", +// "FROM workspaces WHERE ... status", "INSERT INTO workspaces ... status", +// CTEs that reference workspaces, etc.). For each such SQL fragment, +// pull the single-quoted status values out of `status =`, `status IN`, +// `THEN`, and `ELSE`. Every value must be in the union of CREATE TYPE + +// ALTER TYPE ADD VALUE across all migrations. + +import ( + "os" + "path/filepath" + "regexp" + "sort" + "strings" + "testing" +) + +func TestWorkspaceStatusEnum_NoLiteralDrift(t *testing.T) { + t.Parallel() + + repoRoot := findRepoRoot(t) + migrationsDir := filepath.Join(repoRoot, "workspace-server", "migrations") + internalDir := filepath.Join(repoRoot, "workspace-server", "internal") + + enum := loadWorkspaceStatusEnum(t, migrationsDir) + if len(enum) == 0 { + t.Fatalf("could not parse workspace_status enum from %s — gate is non-functional", migrationsDir) + } + + literals := collectWorkspacesStatusLiterals(t, internalDir) + if len(literals) == 0 { + t.Fatalf("found zero workspaces.status literals under %s — gate is non-functional", internalDir) + } + + var rogue []string + for lit := range literals { + if _, ok := enum[lit]; ok { + continue + } + rogue = append(rogue, lit) + } + if len(rogue) > 0 { + sort.Strings(rogue) + t.Errorf( + "workspaces.status literal(s) %v are written by Go code but not in the workspace_status enum.\n"+ + "Add a migration `ALTER TYPE workspace_status ADD VALUE 'X';` (see 046 for shape).\n"+ + "Enum currently is: %v", + rogue, sortedKeys(enum), + ) + } +} + +// loadWorkspaceStatusEnum scans every *.up.sql file for either: +// +// CREATE TYPE workspace_status AS ENUM ('a', 'b', ...) +// ALTER TYPE workspace_status ADD VALUE [IF NOT EXISTS] 'X' [BEFORE|AFTER 'Y'] +// +// and returns the union of every value the enum will hold after all +// migrations apply. +func loadWorkspaceStatusEnum(t *testing.T, migrationsDir string) map[string]struct{} { + t.Helper() + + out := make(map[string]struct{}) + + files, err := filepath.Glob(filepath.Join(migrationsDir, "*.up.sql")) + if err != nil { + t.Fatalf("glob migrations: %v", err) + } + sort.Strings(files) + + createRE := regexp.MustCompile(`(?is)CREATE\s+TYPE\s+workspace_status\s+AS\s+ENUM\s*\(([^)]+)\)`) + addValueRE := regexp.MustCompile(`(?i)ALTER\s+TYPE\s+workspace_status\s+ADD\s+VALUE(?:\s+IF\s+NOT\s+EXISTS)?\s+'([^']+)'`) + literalRE := regexp.MustCompile(`'([^']+)'`) + + for _, f := range files { + body, err := os.ReadFile(f) + if err != nil { + t.Fatalf("read %s: %v", f, err) + } + for _, m := range createRE.FindAllStringSubmatch(string(body), -1) { + for _, lit := range literalRE.FindAllStringSubmatch(m[1], -1) { + out[lit[1]] = struct{}{} + } + } + for _, m := range addValueRE.FindAllStringSubmatch(string(body), -1) { + out[m[1]] = struct{}{} + } + } + return out +} + +// collectWorkspacesStatusLiterals walks every non-test .go file under +// root, finds Go string literals that contain `UPDATE workspaces` or +// `INSERT INTO workspaces`, and extracts the status literals appearing +// inside the matching SQL statement. +// +// Why this scope: any UPDATE/INSERT against `workspaces` is the moment +// a status literal hits the column constrained by the enum. Read-side +// SQL (SELECT ... WHERE status = 'X') cannot fail on enum drift, so it's +// out of scope. JOINs to `workspaces` from other tables (e.g. approvals +// joining workspaces for display) write to a different table's status — +// also out of scope. Anchoring on the leading `UPDATE workspaces` / +// `INSERT INTO workspaces` keyword unambiguously identifies the writes +// we care about. +func collectWorkspacesStatusLiterals(t *testing.T, root string) map[string]struct{} { + t.Helper() + + // Match raw-string and double-quoted Go string literals. Backtick + // strings can span multiple lines. Both forms are extracted via the + // same DOTALL regex over the whole file body. + rawRE := regexp.MustCompile("(?s)`([^`]*?)`") + dquoteRE := regexp.MustCompile(`"((?:[^"\\]|\\.)*)"`) + + // A SQL string is in scope if it begins (after optional leading + // whitespace) with UPDATE workspaces or INSERT INTO workspaces. + // `(?i)` is case-insensitive; `\s*` allows the format-friendly + // leading newline and indent that the codebase uses. + updateWorkspacesRE := regexp.MustCompile(`(?is)^\s*UPDATE\s+workspaces\b`) + insertWorkspacesRE := regexp.MustCompile(`(?is)^\s*INSERT\s+INTO\s+workspaces\b`) + + // Inside a scoped SQL fragment, status literals appear in: + // status = 'X' — assignment in SET (or filter in WHERE) + // status IN ('X', ...) — filter + // status NOT IN ('X') — filter + // THEN 'X' — CASE arm + // ELSE 'X' — CASE default + statusEqRE := regexp.MustCompile(`(?i)status\s*(?:=|!=|<>)\s*'([a-z_]+)'`) + statusInRE := regexp.MustCompile(`(?i)status\s+(?:NOT\s+)?IN\s*\(([^)]*)\)`) + thenRE := regexp.MustCompile(`(?i)THEN\s+'([a-z_]+)'`) + elseRE := regexp.MustCompile(`(?i)ELSE\s+'([a-z_]+)'`) + inListLiteralRE := regexp.MustCompile(`'([a-z_]+)'`) + + out := make(map[string]struct{}) + + walkErr := filepath.Walk(root, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + if !strings.HasSuffix(path, ".go") { + return nil + } + if strings.HasSuffix(path, "_test.go") { + return nil + } + body, err := os.ReadFile(path) + if err != nil { + return err + } + text := string(body) + + harvest := func(fragment string) { + if !updateWorkspacesRE.MatchString(fragment) && !insertWorkspacesRE.MatchString(fragment) { + return + } + for _, m := range statusEqRE.FindAllStringSubmatch(fragment, -1) { + out[m[1]] = struct{}{} + } + for _, m := range statusInRE.FindAllStringSubmatch(fragment, -1) { + for _, lit := range inListLiteralRE.FindAllStringSubmatch(m[1], -1) { + out[lit[1]] = struct{}{} + } + } + for _, m := range thenRE.FindAllStringSubmatch(fragment, -1) { + out[m[1]] = struct{}{} + } + for _, m := range elseRE.FindAllStringSubmatch(fragment, -1) { + out[m[1]] = struct{}{} + } + } + + for _, m := range rawRE.FindAllStringSubmatch(text, -1) { + harvest(m[1]) + } + for _, m := range dquoteRE.FindAllStringSubmatch(text, -1) { + harvest(m[1]) + } + return nil + }) + if walkErr != nil { + t.Fatalf("walk %s: %v", root, walkErr) + } + return out +} + +func findRepoRoot(t *testing.T) string { + t.Helper() + dir, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + for i := 0; i < 8; i++ { + if _, err := os.Stat(filepath.Join(dir, "workspace-server", "migrations")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + break + } + dir = parent + } + t.Fatalf("could not locate repo root with workspace-server/migrations from %s", dir) + return "" +} + +func sortedKeys(m map[string]struct{}) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + sort.Strings(out) + return out +} diff --git a/workspace-server/migrations/046_workspace_status_awaiting_agent.down.sql b/workspace-server/migrations/046_workspace_status_awaiting_agent.down.sql new file mode 100644 index 00000000..8754b5d3 --- /dev/null +++ b/workspace-server/migrations/046_workspace_status_awaiting_agent.down.sql @@ -0,0 +1,53 @@ +-- 046_workspace_status_awaiting_agent.down.sql +-- +-- Reverse 046_workspace_status_awaiting_agent.up.sql. +-- +-- Postgres does NOT support DROP VALUE on an enum. The standard rollback +-- recipe is rename → recreate → cast → drop, which is intrusive (locks +-- workspaces with ACCESS EXCLUSIVE the same way migration 043 did). We +-- punt: only run this manually, and only if you're prepared to nuke and +-- recreate the type. Application code WILL fail if the value disappears +-- and any row currently has it; pre-flight with: +-- +-- UPDATE workspaces SET status = 'offline' +-- WHERE status = 'awaiting_agent'; +-- +-- before running the recipe below. + +BEGIN; + +SET LOCAL lock_timeout = '5s'; + +-- Convert any existing awaiting_agent / hibernating rows to a value the +-- new enum will accept. 'offline' is the safest fallback for awaiting_agent +-- (operator can re-register to bring them back online); 'hibernated' is +-- the natural terminal of an in-flight 'hibernating'. +UPDATE workspaces SET status = 'offline' WHERE status = 'awaiting_agent'; +UPDATE workspaces SET status = 'hibernated' WHERE status = 'hibernating'; + +ALTER TYPE workspace_status RENAME TO workspace_status_with_awaiting; + +CREATE TYPE workspace_status AS ENUM ( + 'provisioning', + 'online', + 'offline', + 'degraded', + 'failed', + 'removed', + 'paused', + 'hibernated' +); + +ALTER TABLE workspaces + ALTER COLUMN status DROP DEFAULT; + +ALTER TABLE workspaces + ALTER COLUMN status TYPE workspace_status + USING status::text::workspace_status; + +ALTER TABLE workspaces + ALTER COLUMN status SET DEFAULT 'provisioning'::workspace_status; + +DROP TYPE workspace_status_with_awaiting; + +COMMIT; diff --git a/workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql b/workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql new file mode 100644 index 00000000..a3a3f26d --- /dev/null +++ b/workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql @@ -0,0 +1,60 @@ +-- 046_workspace_status_awaiting_agent.up.sql +-- +-- Add the missing 'awaiting_agent' and 'hibernating' values to the +-- workspace_status enum. +-- +-- Migration 043 (2026-04-25) introduced the workspace_status enum but +-- omitted two values that application code had already been writing: +-- +-- 'awaiting_agent' — handlers/workspace.go:333 (since 2026-04-24, +-- commit 1e8b5e01, "first-class BYO-compute"): +-- external workspaces created without a URL are +-- meant to park here until the agent registers. +-- 'hibernating' — handlers/workspace_restart.go:271-272 (since +-- 29_workspace_hibernation): intermediate state +-- between 'online' and the final 'hibernated'; +-- acts as a DB-level claim while provisioner.Stop +-- runs. +-- +-- Every UPDATE that tried to write either value failed with +-- `invalid input value for enum workspace_status: "..."`. The +-- consequences were silent because both call sites either dropped or +-- log-and-continued the error: +-- +-- - workspace.go:333 discards the Exec result entirely. Canvas shows +-- `awaiting_agent` in the response, but the row stays in +-- 'provisioning' until first /registry/register. +-- - workspace_restart.go:277 logs and returns; hibernation simply +-- never happens. Idle workspaces consumed resources indefinitely +-- for five days before this gate flagged it. +-- +-- 2026-04-30 PR #2382 ("default external runtime to poll-mode + +-- awaiting_agent") added two more silent-fail sites in +-- registry/liveness.go and registry/healthsweep.go. Both log + continue. +-- Result: liveness expiry no longer transitions runtime='external' +-- rows, and the heartbeat-staleness sweep is a no-op for them. +-- UI/canvas shows external workspaces stuck on 'online' or 'degraded' +-- indefinitely after the agent disconnects. +-- +-- Tests across all four sites used sqlmock, which matches SQL by regex +-- but does not validate against the live enum constraint, so unit +-- tests passed despite the prod-only failures. See +-- feedback_mock_at_drifting_layer in operator memory. +-- +-- This migration adds both enum values so all six call sites start +-- succeeding. IF NOT EXISTS makes the migration idempotent across +-- re-runs (RunMigrations in postgres.go re-applies migrations until +-- schema_migrations records them; a future operator-driven re-run is +-- harmless). +-- +-- ALTER TYPE ADD VALUE is committed immediately by Postgres regardless +-- of transaction wrapping, and does NOT take a heavy lock on workspaces. +-- Safe to run during normal traffic. +-- +-- A regression gate lives at internal/db/workspace_status_enum_drift_test.go: +-- it parses every UPDATE/INSERT against `workspaces` in the Go tree +-- and asserts every status literal is in the enum. That test would +-- have caught both omissions on the day they shipped. + +ALTER TYPE workspace_status ADD VALUE IF NOT EXISTS 'awaiting_agent'; +ALTER TYPE workspace_status ADD VALUE IF NOT EXISTS 'hibernating';