Merge pull request #3011 from Molecule-AI/rfc-2872-workspaces-uniq-toctou

fix(workspace-server): close TOCTOU race on workspaces(parent_id, name) (#2872 Critical 1)
This commit is contained in:
Hongming Wang 2026-05-06 04:51:01 +00:00 committed by GitHub
commit bb9bf85dbd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 203 additions and 99 deletions

View File

@ -82,61 +82,6 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
}
}
// 5s timeout bounds the lookup independently of any HTTP request
// context. createWorkspaceTree runs in goroutines spawned from the
// /org/import handler, so plumbing the request context here would
// cascade-cancel into provisionWorkspaceAuto and abort in-flight
// EC2 provisioning if the client disconnected mid-import — that's
// the wrong behaviour. A short bounded timeout protects the
// per-row SELECT against a wedged DB without taking the
// drop-everything-on-disconnect tradeoff.
ctxLookup, cancelLookup := context.WithTimeout(context.Background(), 5*time.Second)
defer cancelLookup()
// Idempotency: if a workspace with the same (parent_id, name) already
// exists, skip the INSERT + canvas_layouts + broadcast + provisioning.
// This is what makes /org/import safe to call multiple times — the
// historical leak was every call recreating the entire tree (see
// tenant-hongming, 72 distinct child workspaces in 4 days, all from
// repeated org-template spawns of the same template).
//
// Recursion still runs on the existing id so partial-match templates
// (parent exists, some children missing) backfill the missing children
// instead of either no-op'ing the whole subtree or duplicating the
// existing children.
//
// /org/import is ADDITIVE-ONLY, never destructive. Children present
// in the existing tree but absent from the new template are
// preserved (no DELETE on diff). Skip-path also does NOT propagate
// updates to existing nodes — a re-import that adds an
// initial_memory or schedule to an existing workspace is silently
// dropped (the function bypasses seedInitialMemories, schedule SQL,
// channel config for skipped rows). To force-update an existing
// tree, delete and re-import or use a future /org/sync route.
existingID, existing, lookupErr := h.lookupExistingChild(ctxLookup, ws.Name, parentID)
if lookupErr != nil {
return fmt.Errorf("idempotency check for %s: %w", ws.Name, lookupErr)
}
if existing {
log.Printf("Org import: %q already exists (id=%s) — skipping create+provision, recursing into children for partial-match", ws.Name, existingID)
parentRef := ""
if parentID != nil {
parentRef = *parentID
}
provlog.Event("provision.skip_existing", map[string]any{
"name": ws.Name,
"existing_id": existingID,
"parent_id": parentRef,
"tier": tier,
})
*results = append(*results, map[string]interface{}{
"id": existingID,
"name": ws.Name,
"tier": tier,
"skipped": true,
})
return h.recurseChildrenForImport(ws, existingID, absX, absY, defaults, orgBaseDir, results, provisionSem)
}
id := uuid.New().String()
awarenessNS := workspaceAwarenessNamespace(id)
@ -188,10 +133,67 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
if maxConcurrent <= 0 {
maxConcurrent = models.DefaultMaxConcurrentTasks
}
_, err := db.DB.ExecContext(ctx, `
// TOCTOU-safe insert (#2872 Critical 1).
//
// `ON CONFLICT DO NOTHING` paired with the partial unique index
// from migration 20260506000000_workspaces_unique_parent_name.up.sql
// atomically resolves a race window that the prior
// lookup-then-insert had: two concurrent /org/import POSTs both
// saw "not found" in lookupExistingChild and both INSERT'd the
// same (parent_id, name). After this swap the SECOND INSERT
// silently no-ops, RETURNING returns 0 rows → sql.ErrNoRows, and
// the skip-path runs.
//
// ON CONFLICT target uses (COALESCE(parent_id,...), name) WHERE
// status != 'removed' — must match the partial-index predicate
// EXACTLY for Postgres to consider the index applicable.
var insertedID string
err := db.DB.QueryRowContext(ctx, `
INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, max_concurrent_tasks)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess, maxConcurrent)
ON CONFLICT (COALESCE(parent_id, '00000000-0000-0000-0000-000000000000'::uuid), name)
WHERE status != 'removed'
DO NOTHING
RETURNING id
`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess, maxConcurrent).Scan(&insertedID)
if errors.Is(err, sql.ErrNoRows) {
// Skip path — a non-removed row already exists for
// (parent_id, name). Re-select its id; idempotency-friendly
// semantics match the original lookupExistingChild path
// (parent_id IS NOT DISTINCT FROM matches NULL too,
// status='removed' rows are ignored).
ctxLookup, cancelLookup := context.WithTimeout(context.Background(), 5*time.Second)
defer cancelLookup()
existingID, found, selErr := h.lookupExistingChild(ctxLookup, ws.Name, parentID)
if selErr != nil {
return fmt.Errorf("post-conflict re-select for %s: %w", ws.Name, selErr)
}
if !found {
// Index conflicted but row vanished between INSERT and
// re-SELECT (status flipped to 'removed' concurrently).
// Surface as an error rather than silently retrying —
// the user can re-trigger /org/import safely.
return fmt.Errorf("workspace %q conflicted on insert but not visible on re-select (concurrent status flip?)", ws.Name)
}
log.Printf("Org import: %q already exists (id=%s) — skipping create+provision, recursing into children for partial-match", ws.Name, existingID)
parentRef := ""
if parentID != nil {
parentRef = *parentID
}
provlog.Event("provision.skip_existing", map[string]any{
"name": ws.Name,
"existing_id": existingID,
"parent_id": parentRef,
"tier": tier,
})
*results = append(*results, map[string]interface{}{
"id": existingID,
"name": ws.Name,
"tier": tier,
"skipped": true,
})
return h.recurseChildrenForImport(ws, existingID, absX, absY, defaults, orgBaseDir, results, provisionSem)
}
if err != nil {
log.Printf("Org import: failed to create %s: %v", ws.Name, err)
return fmt.Errorf("failed to create %s: %w", ws.Name, err)

View File

@ -209,19 +209,42 @@ func findLookupAndWorkspacesInsertPos(t *testing.T, fname string, src []byte) (l
return
}
// Source-level guard — pins that org_import.go calls
// h.lookupExistingChild BEFORE its INSERT INTO workspaces.
// onConflictDoNothingRE pins the TOCTOU-safe shape introduced by
// migration 20260506000000_workspaces_unique_parent_name.up.sql +
// the org_import.go INSERT swap (#2872 Critical 1). The workspaces
// INSERT MUST funnel concurrent collisions through the partial unique
// index — `ON CONFLICT (...) WHERE status != 'removed' DO NOTHING`
// is the literal pg statement form that achieves it.
//
// The pattern intentionally requires both the COALESCE expression
// (so root-workspace NULL parents collide) AND the partial-index WHERE
// clause (so 'removed' rows don't block re-imports). A regression that
// drops either piece would make the index target a different shape
// than the migration created, and Postgres would emit
// "no unique or exclusion constraint matching the ON CONFLICT
// specification" at runtime — but only on the FIRST collision attempt
// in production, not in CI without a live race. This regex catches
// the shape in source so the bug never ships.
var onConflictDoNothingRE = regexp.MustCompile(
`(?s)ON\s+CONFLICT\s*\(\s*COALESCE\s*\(\s*parent_id\s*,\s*'00000000-0000-0000-0000-000000000000'::uuid\s*\)\s*,\s*name\s*\).*?WHERE\s+status\s*!=\s*'removed'.*?DO\s+NOTHING`,
)
// Source-level guard — pins that org_import.go's INSERT INTO workspaces
// uses the TOCTOU-safe ON CONFLICT DO NOTHING pattern.
//
// Per memory feedback_behavior_based_ast_gates.md: pin the behavior
// (idempotency check before INSERT), not just function names. If a
// future refactor reintroduces the un-checked INSERT (the original
// bug shape that leaked 72 workspaces in 4 days), this test fails.
// (atomic conflict resolution at the DB), not just function names.
// If a future refactor reintroduces the un-checked INSERT (the original
// bug shape that leaked 72 workspaces in 4 days at tenant-hongming),
// this test fails BEFORE the broken code reaches production where the
// race window opens.
//
// AST-walk implementation closes the silent-false-pass mode that the
// previous bytes.Index gate had — see workspacesInsertRE comment for
// the failure mode (workspaces_audit / workspace_secrets / etc.
// shadowing the real target via prefix match).
func TestCreateWorkspaceTree_CallsLookupBeforeInsert(t *testing.T) {
// Replaces an earlier "lookup-before-insert" gate that became obsolete
// when this swap moved idempotency into the database. The earlier
// gate would silent-false-pass against ON CONFLICT — even though that
// shape is correct — because lookupExistingChild now runs AFTER the
// INSERT (only on the skip path, to retrieve the existing id).
func TestCreateWorkspaceTree_InsertUsesOnConflictDoNothing(t *testing.T) {
wd, err := os.Getwd()
if err != nil {
t.Fatalf("getwd: %v", err)
@ -230,30 +253,24 @@ func TestCreateWorkspaceTree_CallsLookupBeforeInsert(t *testing.T) {
if err != nil {
t.Fatalf("read org_import.go: %v", err)
}
lookupPos, insertPos, fset := findLookupAndWorkspacesInsertPos(t, "org_import.go", src)
if lookupPos == token.NoPos {
t.Fatalf("AST: no call to lookupExistingChild in org_import.go — idempotency check removed?")
}
if insertPos == token.NoPos {
insertSQL := findWorkspacesInsertSQL(t, "org_import.go", src)
if insertSQL == "" {
t.Fatalf("AST: no SQL literal matching `^\\s*INSERT INTO workspaces\\s*\\(` in any CallExpr in org_import.go — schema change or rename?")
}
if lookupPos > insertPos {
t.Errorf("lookupExistingChild call at %s must come BEFORE INSERT INTO workspaces at %s — non-idempotent ordering would re-leak under repeat /org/import calls",
fset.Position(lookupPos), fset.Position(insertPos))
if !onConflictDoNothingRE.MatchString(insertSQL) {
t.Errorf("workspaces INSERT SQL does NOT use the TOCTOU-safe ON CONFLICT shape — concurrent /org/import POSTs will silently double-insert. Required pattern:\n ON CONFLICT (COALESCE(parent_id, '00000000-...'::uuid), name) WHERE status != 'removed' DO NOTHING\n\nActual SQL:\n%s", insertSQL)
}
}
// TestGate_FailsWhenLookupAfterInsert proves the gate actually catches
// the bug it's named after — running it against synthetic Go source
// where the lookup call is positioned AFTER the workspaces INSERT must
// produce lookupPos > insertPos, which the production gate flags as
// an ERROR. Without this test the gate could regress to "always pass"
// and we wouldn't notice until the bug shipped again.
// TestGate_FailsWhenInsertOmitsOnConflict proves the gate actually
// catches the bug it's named after — running it against synthetic Go
// source where the workspaces INSERT lacks the ON CONFLICT clause must
// fail the regex match. Without this test the gate could regress to
// "always pass" and the TOCTOU window would silently reopen.
//
// Per memory feedback_assert_exact_not_substring.md: verify a
// tightened test FAILS on old code before merging.
func TestGate_FailsWhenLookupAfterInsert(t *testing.T) {
// Per memory feedback_assert_exact_not_substring.md: verify the
// tightened test FAILS on the bug shape before merging.
func TestGate_FailsWhenInsertOmitsOnConflict(t *testing.T) {
const buggySrc = `package handlers
import "context"
@ -264,26 +281,57 @@ func (fakeDB) ExecContext(ctx context.Context, sql string, args ...interface{})
type fakeOrgHandler struct{}
func (h *fakeOrgHandler) lookupExistingChild(ctx context.Context, name string, parentID *string) (string, bool, error) {
return "", false, nil
}
func buggyCreate(h *fakeOrgHandler, db fakeDB, ctx context.Context, name string, parentID *string) {
// Bug shape: INSERT runs FIRST, lookup runs AFTER. This is the
// non-idempotent ordering the gate exists to forbid.
// Bug shape: bare INSERT, no ON CONFLICT. Two concurrent calls
// race past the unique-index check before either completes the
// transaction; constraint failure surfaces as a 500 to the
// caller (not graceful skip). Pre-#2872 this would silently
// duplicate-insert.
db.ExecContext(ctx, ` + "`INSERT INTO workspaces (id, name) VALUES ($1, $2)`" + `, "x", name)
h.lookupExistingChild(ctx, name, parentID)
}
`
lookupPos, insertPos, _ := findLookupAndWorkspacesInsertPos(t, "buggy.go", []byte(buggySrc))
if lookupPos == token.NoPos || insertPos == token.NoPos {
t.Fatalf("synthetic buggy source missing expected nodes (lookupPos=%v insertPos=%v) — helper logic regression", lookupPos, insertPos)
insertSQL := findWorkspacesInsertSQL(t, "buggy.go", []byte(buggySrc))
if insertSQL == "" {
t.Fatalf("synthetic buggy source missing workspaces INSERT — helper logic regression")
}
if lookupPos < insertPos {
t.Fatalf("synthetic bug shape (lookup AFTER insert) returned lookupPos=%d < insertPos=%d — gate would NOT fire on actual bug, regression!", lookupPos, insertPos)
if onConflictDoNothingRE.MatchString(insertSQL) {
t.Fatalf("synthetic bug shape (bare INSERT, no ON CONFLICT) was MATCHED by the gate — regression: gate would not flag the actual bug. SQL:\n%s", insertSQL)
}
// Implicit: lookupPos > insertPos here, which the production gate
// flags via t.Errorf. This proves the gate is live, not vestigial.
}
// findWorkspacesInsertSQL walks `src` and returns the unquoted SQL of
// the first string literal matching workspacesInsertRE inside any
// CallExpr's argument list. Returns "" if none found. Helper for the
// ON CONFLICT gate above.
func findWorkspacesInsertSQL(t *testing.T, fname string, src []byte) string {
t.Helper()
fset := token.NewFileSet()
file, err := parser.ParseFile(fset, fname, src, parser.ParseComments)
if err != nil {
t.Fatalf("parse %s: %v", fname, err)
}
var sql string
ast.Inspect(file, func(n ast.Node) bool {
call, ok := n.(*ast.CallExpr)
if !ok {
return true
}
for _, arg := range call.Args {
lit, ok := arg.(*ast.BasicLit)
if !ok || lit.Kind != token.STRING {
continue
}
raw := lit.Value
if unq, err := strconv.Unquote(raw); err == nil {
raw = unq
}
if workspacesInsertRE.MatchString(raw) && sql == "" {
sql = raw
}
}
return true
})
return sql
}
// TestGate_IgnoresAuditTableShadow proves the regex tightening

View File

@ -0,0 +1,8 @@
-- Reverse of 20260506000000_workspaces_unique_parent_name.up.sql.
--
-- DROP CONCURRENTLY for the same reason CREATE was CONCURRENTLY:
-- avoid an ACCESS EXCLUSIVE lock during teardown on tenants under
-- live traffic. IF EXISTS makes this idempotent if the up-migration
-- never landed (e.g. resumed deploy on a fresh tenant).
DROP INDEX CONCURRENTLY IF EXISTS workspaces_parent_name_uniq;

View File

@ -0,0 +1,46 @@
-- TOCTOU backstop on workspaces(parent_id, name)
--
-- Origin: #2872 Critical 1 — /org/import had no per-tenant mutex,
-- advisory lock, or DB-level uniqueness, so two concurrent admin
-- POSTs (rapid double-click in canvas, retry-after-timeout, two
-- operators on the same template) both saw "not found" in
-- lookupExistingChild and both INSERT'd the same (parent_id, name)
-- row. Sweeper #2860 cleaned residual drift; this migration prevents
-- new collisions.
--
-- Why a partial index keyed on COALESCE(parent_id, sentinel):
--
-- - Postgres treats NULL ≠ NULL in a UNIQUE constraint, so root
-- workspaces (parent_id = NULL) would not collide pairwise even
-- if they shared the same name. COALESCE collapses NULLs to a
-- sentinel UUID so root collisions are caught.
--
-- - The `WHERE status != 'removed'` partial-index filter makes a
-- tombstoned row (collapsed team, deleted workspace) NOT block a
-- re-import using the same name, which preserves the existing
-- org-import semantics (lookupExistingChild already excludes
-- status='removed').
--
-- Why CONCURRENTLY:
--
-- - Builds the index without an ACCESS EXCLUSIVE lock on workspaces.
-- Production tenants serve live traffic during migration; a
-- blocking index build would cause request stalls.
-- - CONCURRENTLY MUST run outside a transaction. The migration
-- runner is configured to honour this (no BEGIN/COMMIT wrapper
-- around an idempotent CREATE INDEX CONCURRENTLY IF NOT EXISTS).
-- - IF NOT EXISTS makes this resumable: a partial build (e.g. CI
-- killed mid-flight) leaves an INVALID index; re-running CONCURRENTLY
-- after a manual REINDEX repairs without erroring on already-built.
--
-- Drift detection: companion test
-- workspaces_unique_parent_name_test.go pre-flights the index on a
-- live test DB to confirm the migration applied + the constraint is
-- enforceable.
CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS workspaces_parent_name_uniq
ON workspaces (
COALESCE(parent_id, '00000000-0000-0000-0000-000000000000'::uuid),
name
)
WHERE status != 'removed';