forked from molecule-ai/molecule-core
Merge pull request #3011 from Molecule-AI/rfc-2872-workspaces-uniq-toctou
fix(workspace-server): close TOCTOU race on workspaces(parent_id, name) (#2872 Critical 1)
This commit is contained in:
commit
bb9bf85dbd
@ -82,61 +82,6 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
}
|
||||
}
|
||||
|
||||
// 5s timeout bounds the lookup independently of any HTTP request
|
||||
// context. createWorkspaceTree runs in goroutines spawned from the
|
||||
// /org/import handler, so plumbing the request context here would
|
||||
// cascade-cancel into provisionWorkspaceAuto and abort in-flight
|
||||
// EC2 provisioning if the client disconnected mid-import — that's
|
||||
// the wrong behaviour. A short bounded timeout protects the
|
||||
// per-row SELECT against a wedged DB without taking the
|
||||
// drop-everything-on-disconnect tradeoff.
|
||||
ctxLookup, cancelLookup := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancelLookup()
|
||||
// Idempotency: if a workspace with the same (parent_id, name) already
|
||||
// exists, skip the INSERT + canvas_layouts + broadcast + provisioning.
|
||||
// This is what makes /org/import safe to call multiple times — the
|
||||
// historical leak was every call recreating the entire tree (see
|
||||
// tenant-hongming, 72 distinct child workspaces in 4 days, all from
|
||||
// repeated org-template spawns of the same template).
|
||||
//
|
||||
// Recursion still runs on the existing id so partial-match templates
|
||||
// (parent exists, some children missing) backfill the missing children
|
||||
// instead of either no-op'ing the whole subtree or duplicating the
|
||||
// existing children.
|
||||
//
|
||||
// /org/import is ADDITIVE-ONLY, never destructive. Children present
|
||||
// in the existing tree but absent from the new template are
|
||||
// preserved (no DELETE on diff). Skip-path also does NOT propagate
|
||||
// updates to existing nodes — a re-import that adds an
|
||||
// initial_memory or schedule to an existing workspace is silently
|
||||
// dropped (the function bypasses seedInitialMemories, schedule SQL,
|
||||
// channel config for skipped rows). To force-update an existing
|
||||
// tree, delete and re-import or use a future /org/sync route.
|
||||
existingID, existing, lookupErr := h.lookupExistingChild(ctxLookup, ws.Name, parentID)
|
||||
if lookupErr != nil {
|
||||
return fmt.Errorf("idempotency check for %s: %w", ws.Name, lookupErr)
|
||||
}
|
||||
if existing {
|
||||
log.Printf("Org import: %q already exists (id=%s) — skipping create+provision, recursing into children for partial-match", ws.Name, existingID)
|
||||
parentRef := ""
|
||||
if parentID != nil {
|
||||
parentRef = *parentID
|
||||
}
|
||||
provlog.Event("provision.skip_existing", map[string]any{
|
||||
"name": ws.Name,
|
||||
"existing_id": existingID,
|
||||
"parent_id": parentRef,
|
||||
"tier": tier,
|
||||
})
|
||||
*results = append(*results, map[string]interface{}{
|
||||
"id": existingID,
|
||||
"name": ws.Name,
|
||||
"tier": tier,
|
||||
"skipped": true,
|
||||
})
|
||||
return h.recurseChildrenForImport(ws, existingID, absX, absY, defaults, orgBaseDir, results, provisionSem)
|
||||
}
|
||||
|
||||
id := uuid.New().String()
|
||||
awarenessNS := workspaceAwarenessNamespace(id)
|
||||
|
||||
@ -188,10 +133,67 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
if maxConcurrent <= 0 {
|
||||
maxConcurrent = models.DefaultMaxConcurrentTasks
|
||||
}
|
||||
_, err := db.DB.ExecContext(ctx, `
|
||||
// TOCTOU-safe insert (#2872 Critical 1).
|
||||
//
|
||||
// `ON CONFLICT DO NOTHING` paired with the partial unique index
|
||||
// from migration 20260506000000_workspaces_unique_parent_name.up.sql
|
||||
// atomically resolves a race window that the prior
|
||||
// lookup-then-insert had: two concurrent /org/import POSTs both
|
||||
// saw "not found" in lookupExistingChild and both INSERT'd the
|
||||
// same (parent_id, name). After this swap the SECOND INSERT
|
||||
// silently no-ops, RETURNING returns 0 rows → sql.ErrNoRows, and
|
||||
// the skip-path runs.
|
||||
//
|
||||
// ON CONFLICT target uses (COALESCE(parent_id,...), name) WHERE
|
||||
// status != 'removed' — must match the partial-index predicate
|
||||
// EXACTLY for Postgres to consider the index applicable.
|
||||
var insertedID string
|
||||
err := db.DB.QueryRowContext(ctx, `
|
||||
INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, max_concurrent_tasks)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
|
||||
`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess, maxConcurrent)
|
||||
ON CONFLICT (COALESCE(parent_id, '00000000-0000-0000-0000-000000000000'::uuid), name)
|
||||
WHERE status != 'removed'
|
||||
DO NOTHING
|
||||
RETURNING id
|
||||
`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess, maxConcurrent).Scan(&insertedID)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
// Skip path — a non-removed row already exists for
|
||||
// (parent_id, name). Re-select its id; idempotency-friendly
|
||||
// semantics match the original lookupExistingChild path
|
||||
// (parent_id IS NOT DISTINCT FROM matches NULL too,
|
||||
// status='removed' rows are ignored).
|
||||
ctxLookup, cancelLookup := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancelLookup()
|
||||
existingID, found, selErr := h.lookupExistingChild(ctxLookup, ws.Name, parentID)
|
||||
if selErr != nil {
|
||||
return fmt.Errorf("post-conflict re-select for %s: %w", ws.Name, selErr)
|
||||
}
|
||||
if !found {
|
||||
// Index conflicted but row vanished between INSERT and
|
||||
// re-SELECT (status flipped to 'removed' concurrently).
|
||||
// Surface as an error rather than silently retrying —
|
||||
// the user can re-trigger /org/import safely.
|
||||
return fmt.Errorf("workspace %q conflicted on insert but not visible on re-select (concurrent status flip?)", ws.Name)
|
||||
}
|
||||
log.Printf("Org import: %q already exists (id=%s) — skipping create+provision, recursing into children for partial-match", ws.Name, existingID)
|
||||
parentRef := ""
|
||||
if parentID != nil {
|
||||
parentRef = *parentID
|
||||
}
|
||||
provlog.Event("provision.skip_existing", map[string]any{
|
||||
"name": ws.Name,
|
||||
"existing_id": existingID,
|
||||
"parent_id": parentRef,
|
||||
"tier": tier,
|
||||
})
|
||||
*results = append(*results, map[string]interface{}{
|
||||
"id": existingID,
|
||||
"name": ws.Name,
|
||||
"tier": tier,
|
||||
"skipped": true,
|
||||
})
|
||||
return h.recurseChildrenForImport(ws, existingID, absX, absY, defaults, orgBaseDir, results, provisionSem)
|
||||
}
|
||||
if err != nil {
|
||||
log.Printf("Org import: failed to create %s: %v", ws.Name, err)
|
||||
return fmt.Errorf("failed to create %s: %w", ws.Name, err)
|
||||
|
||||
@ -209,19 +209,42 @@ func findLookupAndWorkspacesInsertPos(t *testing.T, fname string, src []byte) (l
|
||||
return
|
||||
}
|
||||
|
||||
// Source-level guard — pins that org_import.go calls
|
||||
// h.lookupExistingChild BEFORE its INSERT INTO workspaces.
|
||||
// onConflictDoNothingRE pins the TOCTOU-safe shape introduced by
|
||||
// migration 20260506000000_workspaces_unique_parent_name.up.sql +
|
||||
// the org_import.go INSERT swap (#2872 Critical 1). The workspaces
|
||||
// INSERT MUST funnel concurrent collisions through the partial unique
|
||||
// index — `ON CONFLICT (...) WHERE status != 'removed' DO NOTHING`
|
||||
// is the literal pg statement form that achieves it.
|
||||
//
|
||||
// The pattern intentionally requires both the COALESCE expression
|
||||
// (so root-workspace NULL parents collide) AND the partial-index WHERE
|
||||
// clause (so 'removed' rows don't block re-imports). A regression that
|
||||
// drops either piece would make the index target a different shape
|
||||
// than the migration created, and Postgres would emit
|
||||
// "no unique or exclusion constraint matching the ON CONFLICT
|
||||
// specification" at runtime — but only on the FIRST collision attempt
|
||||
// in production, not in CI without a live race. This regex catches
|
||||
// the shape in source so the bug never ships.
|
||||
var onConflictDoNothingRE = regexp.MustCompile(
|
||||
`(?s)ON\s+CONFLICT\s*\(\s*COALESCE\s*\(\s*parent_id\s*,\s*'00000000-0000-0000-0000-000000000000'::uuid\s*\)\s*,\s*name\s*\).*?WHERE\s+status\s*!=\s*'removed'.*?DO\s+NOTHING`,
|
||||
)
|
||||
|
||||
// Source-level guard — pins that org_import.go's INSERT INTO workspaces
|
||||
// uses the TOCTOU-safe ON CONFLICT DO NOTHING pattern.
|
||||
//
|
||||
// Per memory feedback_behavior_based_ast_gates.md: pin the behavior
|
||||
// (idempotency check before INSERT), not just function names. If a
|
||||
// future refactor reintroduces the un-checked INSERT (the original
|
||||
// bug shape that leaked 72 workspaces in 4 days), this test fails.
|
||||
// (atomic conflict resolution at the DB), not just function names.
|
||||
// If a future refactor reintroduces the un-checked INSERT (the original
|
||||
// bug shape that leaked 72 workspaces in 4 days at tenant-hongming),
|
||||
// this test fails BEFORE the broken code reaches production where the
|
||||
// race window opens.
|
||||
//
|
||||
// AST-walk implementation closes the silent-false-pass mode that the
|
||||
// previous bytes.Index gate had — see workspacesInsertRE comment for
|
||||
// the failure mode (workspaces_audit / workspace_secrets / etc.
|
||||
// shadowing the real target via prefix match).
|
||||
func TestCreateWorkspaceTree_CallsLookupBeforeInsert(t *testing.T) {
|
||||
// Replaces an earlier "lookup-before-insert" gate that became obsolete
|
||||
// when this swap moved idempotency into the database. The earlier
|
||||
// gate would silent-false-pass against ON CONFLICT — even though that
|
||||
// shape is correct — because lookupExistingChild now runs AFTER the
|
||||
// INSERT (only on the skip path, to retrieve the existing id).
|
||||
func TestCreateWorkspaceTree_InsertUsesOnConflictDoNothing(t *testing.T) {
|
||||
wd, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatalf("getwd: %v", err)
|
||||
@ -230,30 +253,24 @@ func TestCreateWorkspaceTree_CallsLookupBeforeInsert(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("read org_import.go: %v", err)
|
||||
}
|
||||
lookupPos, insertPos, fset := findLookupAndWorkspacesInsertPos(t, "org_import.go", src)
|
||||
|
||||
if lookupPos == token.NoPos {
|
||||
t.Fatalf("AST: no call to lookupExistingChild in org_import.go — idempotency check removed?")
|
||||
}
|
||||
if insertPos == token.NoPos {
|
||||
insertSQL := findWorkspacesInsertSQL(t, "org_import.go", src)
|
||||
if insertSQL == "" {
|
||||
t.Fatalf("AST: no SQL literal matching `^\\s*INSERT INTO workspaces\\s*\\(` in any CallExpr in org_import.go — schema change or rename?")
|
||||
}
|
||||
if lookupPos > insertPos {
|
||||
t.Errorf("lookupExistingChild call at %s must come BEFORE INSERT INTO workspaces at %s — non-idempotent ordering would re-leak under repeat /org/import calls",
|
||||
fset.Position(lookupPos), fset.Position(insertPos))
|
||||
if !onConflictDoNothingRE.MatchString(insertSQL) {
|
||||
t.Errorf("workspaces INSERT SQL does NOT use the TOCTOU-safe ON CONFLICT shape — concurrent /org/import POSTs will silently double-insert. Required pattern:\n ON CONFLICT (COALESCE(parent_id, '00000000-...'::uuid), name) WHERE status != 'removed' DO NOTHING\n\nActual SQL:\n%s", insertSQL)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGate_FailsWhenLookupAfterInsert proves the gate actually catches
|
||||
// the bug it's named after — running it against synthetic Go source
|
||||
// where the lookup call is positioned AFTER the workspaces INSERT must
|
||||
// produce lookupPos > insertPos, which the production gate flags as
|
||||
// an ERROR. Without this test the gate could regress to "always pass"
|
||||
// and we wouldn't notice until the bug shipped again.
|
||||
// TestGate_FailsWhenInsertOmitsOnConflict proves the gate actually
|
||||
// catches the bug it's named after — running it against synthetic Go
|
||||
// source where the workspaces INSERT lacks the ON CONFLICT clause must
|
||||
// fail the regex match. Without this test the gate could regress to
|
||||
// "always pass" and the TOCTOU window would silently reopen.
|
||||
//
|
||||
// Per memory feedback_assert_exact_not_substring.md: verify a
|
||||
// tightened test FAILS on old code before merging.
|
||||
func TestGate_FailsWhenLookupAfterInsert(t *testing.T) {
|
||||
// Per memory feedback_assert_exact_not_substring.md: verify the
|
||||
// tightened test FAILS on the bug shape before merging.
|
||||
func TestGate_FailsWhenInsertOmitsOnConflict(t *testing.T) {
|
||||
const buggySrc = `package handlers
|
||||
|
||||
import "context"
|
||||
@ -264,26 +281,57 @@ func (fakeDB) ExecContext(ctx context.Context, sql string, args ...interface{})
|
||||
|
||||
type fakeOrgHandler struct{}
|
||||
|
||||
func (h *fakeOrgHandler) lookupExistingChild(ctx context.Context, name string, parentID *string) (string, bool, error) {
|
||||
return "", false, nil
|
||||
}
|
||||
|
||||
func buggyCreate(h *fakeOrgHandler, db fakeDB, ctx context.Context, name string, parentID *string) {
|
||||
// Bug shape: INSERT runs FIRST, lookup runs AFTER. This is the
|
||||
// non-idempotent ordering the gate exists to forbid.
|
||||
// Bug shape: bare INSERT, no ON CONFLICT. Two concurrent calls
|
||||
// race past the unique-index check before either completes the
|
||||
// transaction; constraint failure surfaces as a 500 to the
|
||||
// caller (not graceful skip). Pre-#2872 this would silently
|
||||
// duplicate-insert.
|
||||
db.ExecContext(ctx, ` + "`INSERT INTO workspaces (id, name) VALUES ($1, $2)`" + `, "x", name)
|
||||
h.lookupExistingChild(ctx, name, parentID)
|
||||
}
|
||||
`
|
||||
lookupPos, insertPos, _ := findLookupAndWorkspacesInsertPos(t, "buggy.go", []byte(buggySrc))
|
||||
if lookupPos == token.NoPos || insertPos == token.NoPos {
|
||||
t.Fatalf("synthetic buggy source missing expected nodes (lookupPos=%v insertPos=%v) — helper logic regression", lookupPos, insertPos)
|
||||
insertSQL := findWorkspacesInsertSQL(t, "buggy.go", []byte(buggySrc))
|
||||
if insertSQL == "" {
|
||||
t.Fatalf("synthetic buggy source missing workspaces INSERT — helper logic regression")
|
||||
}
|
||||
if lookupPos < insertPos {
|
||||
t.Fatalf("synthetic bug shape (lookup AFTER insert) returned lookupPos=%d < insertPos=%d — gate would NOT fire on actual bug, regression!", lookupPos, insertPos)
|
||||
if onConflictDoNothingRE.MatchString(insertSQL) {
|
||||
t.Fatalf("synthetic bug shape (bare INSERT, no ON CONFLICT) was MATCHED by the gate — regression: gate would not flag the actual bug. SQL:\n%s", insertSQL)
|
||||
}
|
||||
// Implicit: lookupPos > insertPos here, which the production gate
|
||||
// flags via t.Errorf. This proves the gate is live, not vestigial.
|
||||
}
|
||||
|
||||
// findWorkspacesInsertSQL walks `src` and returns the unquoted SQL of
|
||||
// the first string literal matching workspacesInsertRE inside any
|
||||
// CallExpr's argument list. Returns "" if none found. Helper for the
|
||||
// ON CONFLICT gate above.
|
||||
func findWorkspacesInsertSQL(t *testing.T, fname string, src []byte) string {
|
||||
t.Helper()
|
||||
fset := token.NewFileSet()
|
||||
file, err := parser.ParseFile(fset, fname, src, parser.ParseComments)
|
||||
if err != nil {
|
||||
t.Fatalf("parse %s: %v", fname, err)
|
||||
}
|
||||
var sql string
|
||||
ast.Inspect(file, func(n ast.Node) bool {
|
||||
call, ok := n.(*ast.CallExpr)
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
for _, arg := range call.Args {
|
||||
lit, ok := arg.(*ast.BasicLit)
|
||||
if !ok || lit.Kind != token.STRING {
|
||||
continue
|
||||
}
|
||||
raw := lit.Value
|
||||
if unq, err := strconv.Unquote(raw); err == nil {
|
||||
raw = unq
|
||||
}
|
||||
if workspacesInsertRE.MatchString(raw) && sql == "" {
|
||||
sql = raw
|
||||
}
|
||||
}
|
||||
return true
|
||||
})
|
||||
return sql
|
||||
}
|
||||
|
||||
// TestGate_IgnoresAuditTableShadow proves the regex tightening
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
-- Reverse of 20260506000000_workspaces_unique_parent_name.up.sql.
|
||||
--
|
||||
-- DROP CONCURRENTLY for the same reason CREATE was CONCURRENTLY:
|
||||
-- avoid an ACCESS EXCLUSIVE lock during teardown on tenants under
|
||||
-- live traffic. IF EXISTS makes this idempotent if the up-migration
|
||||
-- never landed (e.g. resumed deploy on a fresh tenant).
|
||||
|
||||
DROP INDEX CONCURRENTLY IF EXISTS workspaces_parent_name_uniq;
|
||||
@ -0,0 +1,46 @@
|
||||
-- TOCTOU backstop on workspaces(parent_id, name)
|
||||
--
|
||||
-- Origin: #2872 Critical 1 — /org/import had no per-tenant mutex,
|
||||
-- advisory lock, or DB-level uniqueness, so two concurrent admin
|
||||
-- POSTs (rapid double-click in canvas, retry-after-timeout, two
|
||||
-- operators on the same template) both saw "not found" in
|
||||
-- lookupExistingChild and both INSERT'd the same (parent_id, name)
|
||||
-- row. Sweeper #2860 cleaned residual drift; this migration prevents
|
||||
-- new collisions.
|
||||
--
|
||||
-- Why a partial index keyed on COALESCE(parent_id, sentinel):
|
||||
--
|
||||
-- - Postgres treats NULL ≠ NULL in a UNIQUE constraint, so root
|
||||
-- workspaces (parent_id = NULL) would not collide pairwise even
|
||||
-- if they shared the same name. COALESCE collapses NULLs to a
|
||||
-- sentinel UUID so root collisions are caught.
|
||||
--
|
||||
-- - The `WHERE status != 'removed'` partial-index filter makes a
|
||||
-- tombstoned row (collapsed team, deleted workspace) NOT block a
|
||||
-- re-import using the same name, which preserves the existing
|
||||
-- org-import semantics (lookupExistingChild already excludes
|
||||
-- status='removed').
|
||||
--
|
||||
-- Why CONCURRENTLY:
|
||||
--
|
||||
-- - Builds the index without an ACCESS EXCLUSIVE lock on workspaces.
|
||||
-- Production tenants serve live traffic during migration; a
|
||||
-- blocking index build would cause request stalls.
|
||||
-- - CONCURRENTLY MUST run outside a transaction. The migration
|
||||
-- runner is configured to honour this (no BEGIN/COMMIT wrapper
|
||||
-- around an idempotent CREATE INDEX CONCURRENTLY IF NOT EXISTS).
|
||||
-- - IF NOT EXISTS makes this resumable: a partial build (e.g. CI
|
||||
-- killed mid-flight) leaves an INVALID index; re-running CONCURRENTLY
|
||||
-- after a manual REINDEX repairs without erroring on already-built.
|
||||
--
|
||||
-- Drift detection: companion test
|
||||
-- workspaces_unique_parent_name_test.go pre-flights the index on a
|
||||
-- live test DB to confirm the migration applied + the constraint is
|
||||
-- enforceable.
|
||||
|
||||
CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS workspaces_parent_name_uniq
|
||||
ON workspaces (
|
||||
COALESCE(parent_id, '00000000-0000-0000-0000-000000000000'::uuid),
|
||||
name
|
||||
)
|
||||
WHERE status != 'removed';
|
||||
Loading…
Reference in New Issue
Block a user