2026-06-12 21:31:42 +00:00
3 changed files with 322 additions and 2 deletions
@@ -355,19 +355,50 @@ func (h *WorkspaceHandler) RestartWorkspaceAuto(ctx context.Context, workspaceID
 // flag to operators — it reads ?reset_session=true from the query
 // string when an operator wants to force a fresh session.
 func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) bool {
+	// Per-workspace restart/provision GATE: serializes the Stop+Start
+	// cycle for this ws-<id> against any concurrent programmatic
+	// RestartByID path (runRestartCycle). Without this gate, the
+	// manual HTTP Restart and the programmatic preflight/secrets
+	// RestartByID both async-dispatch Stop→Start; two provision
+	// attempts reach provisioner.Start for the same ws-<id> and race
+	// on the Docker name → markProvisionFailed → workspace wedged
+	// "failed" (repro: #2659 Local Provision Lifecycle stub, run
+	// 353677/job 478450). The block here ensures only one Stop+Start
+	// per ws-<id> is in flight at a time. The provision leg runs in
+	// a goroutine (preserves the pre-fix context.Background() detach
+	// so an aborted client connection doesn't cancel the in-flight
+	// provision) but the gate is held by that goroutine — Unlock
+	// happens at the end of the provision, after the new container
+	// is up.
+	gate := acquireRestartProvisionGate(workspaceID)
+	gate.Lock()
 	// Stop leg first. CP-first ordering matches the other dispatchers
 	// (provisionWorkspaceAuto, StopWorkspaceAuto) and the convention
 	// documented in docs/architecture/backends.md.
 	if h.cpProv != nil {
 		h.cpStopWithRetry(ctx, workspaceID, "RestartWorkspaceAuto")
 		// resetClaudeSession is Docker-only — CP has no session state to clear.
-		h.goAsync(func() { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) })
+		// h.goAsync (not raw `go`) so the goroutine is TRACKED on h.asyncWG
+		// (shutdown/leak management + tests can waitAsyncForTest). The
+		// gate unlock is the provision leg's tail — it's the load-bearing
+		// exclusion that lets the second concurrent cycle start only after
+		// this one's Start is fully done.
+		h.goAsync(func() {
+			defer gate.Unlock()
+			h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
+		})
 		return true
 	}
 	if h.provisioner != nil {
 		// Docker.Stop has no retry — see docstring rationale.
 		h.provisioner.Stop(ctx, workspaceID)
-		h.goAsync(func() { h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession) })
+		// h.goAsync for the same reason as the cpProv branch above — the
+		// per-workspace gate is held for the entire cycle (Stop done
+		// synchronously, then async provision that releases on completion).
+		h.goAsync(func() {
+			defer gate.Unlock()
+			h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession)
+		})
 		return true
 	}
 	// No backend wired — same shape as provisionWorkspaceAuto's no-backend
@@ -380,5 +411,6 @@ func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspa
 	h.markProvisionFailed(failCtx, workspaceID,
 		"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
 		nil)
+	gate.Unlock()
 	return false
 }
@@ -78,6 +78,59 @@ var RestartDebounceWindow = 60 * time.Second
 // workspace-server yet — that's a separate RFC.
 var restartByIDDropCounter atomic.Uint64

+// restartProvisionGates is a per-workspace mutex that serializes the
+// Stop+Start cycle for a given ws-<id>. Closes the race where manual
+// POST /workspaces/:id/restart (RestartWorkspaceAutoOpts) and programmatic
+// RestartByID (runRestartCycle) BOTH async-dispatch Stop→Start and
+// reached provisioner.Start twice for the same ws-<id>: the first call
+// created ws-{id} and started writing tokens, the second call raced in
+// and either (a) hit a Docker name conflict and markProvisionFailed
+// wedged the workspace to "failed", or (b) silently rotated/wrote
+// the bearer a second time and the second container start overlapped
+// the first. Both surfaced as the "401 invalid auth token" /
+// Docker-name-conflict symptom in #2659 Local Provision Lifecycle stub,
+// run 353677/job 478450.
+//
+// Each workspace gets its own *sync.Mutex on first use (sync.Map
+// LoadOrStore is the standard "map of locks" pattern — every workspace
+// that ever restarted keeps a tiny mutex in this map for the process
+// lifetime; if memory ever becomes a concern we can prune on workspace
+// removal, but workspace counts are small enough that it's a non-issue).
+//
+// The gate is intentionally separate from the existing `restartState`
+// (coalesceRestart) and the RestartDebounceWindow self-fire debounce:
+//   - restartState coalesces: collapses N rapid concurrent RestartByID
+//     calls into ≤2 sequential cycles (the in-flight one + one drain).
+//   - RestartDebounceWindow self-fire: drops successive RestartByID
+//     calls within 60s of the most recent cycle start (closes the
+//     probe-during-EC2-pending self-fire loop).
+//   - restartProvisionGate (THIS): the load-bearing exclusion that
+//     makes "only ONE Docker create per ws-<id> at a time" hold across
+//     the TWO different entry points (manual Restart HTTP + programmatic
+//     RestartByID). The existing two gates only cover the RestartByID
+//     path; the manual Restart HTTP handler bypassed both and called
+//     RestartWorkspaceAutoOpts directly.
+//
+// Both RestartWorkspaceAutoOpts and runRestartCycle acquire this gate
+// around their Stop+Start pair. A concurrent caller blocks (Go mutex
+// semantics) until the in-flight cycle completes — so the second
+// caller's Stop+Start runs AFTER the first's Start is fully done, and
+// the second's provisioner.Start is the only one in flight at a time.
+// The HTTP UX cost: a user double-clicking Restart gets a delayed
+// response on the second click (the second Stop+Start waits). That's
+// strictly better than the pre-fix behavior where the second click
+// wedged the workspace to "failed".
+var restartProvisionGates sync.Map // map[workspaceID]*sync.Mutex
+
+// acquireRestartProvisionGate returns the per-workspace mutex, creating
+// it on first use. Caller MUST defer Unlock after acquisition. The
+// mutex is intended to wrap the entire Stop+Start cycle, not just
+// provisioner.Start — the race is in the full sequence, not only Start.
+func acquireRestartProvisionGate(workspaceID string) *sync.Mutex {
+	sv, _ := restartProvisionGates.LoadOrStore(workspaceID, &sync.Mutex{})
+	return sv.(*sync.Mutex)
+}
+
 // fileWriteRestartDebounceWindow is the per-workspace coalescing window for
 // the file-write → RestartByID trigger fired by templates.go's WriteFile,
 // DeleteFile, and ReplaceFiles handlers (and template_import.go's variants).
@@ -810,9 +863,29 @@ func (h *WorkspaceHandler) cpStopWithRetryErr(ctx context.Context, workspaceID,
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
 // returned before the new container was up, the loop would race the
 // in-progress provision goroutine on the next iteration's Stop call.
+//
+// The cycle is wrapped in the per-workspace restart/provision GATE
+// (acquireRestartProvisionGate) so concurrent programmatic RestartByID
+// calls and the manual HTTP Restart handler (RestartWorkspaceAutoOpts)
+// cannot overlap their provisioner.Start calls for the same ws-<id>.
+// The outer coalesceRestart pending-flag already serializes N
+// programmatic RestartByID calls into ≤2 sequential cycles; this gate
+// closes the second class of race (manual + programmatic, or two
+// distinct programmatic entry points firing near-simultaneously) that
+// the pending-flag didn't cover.
 func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	ctx := context.Background()

+	// Per-workspace restart/provision gate. The same gate is acquired
+	// in RestartWorkspaceAutoOpts (the manual HTTP Restart path), so
+	// the manual and programmatic paths mutually exclude on Stop+Start.
+	// Held for the entire cycle (Stop → provision); the coalesceRestart
+	// drain loop's inner cycles re-use the same gate because the
+	// outer cycle holds it across the drain.
+	gate := acquireRestartProvisionGate(workspaceID)
+	gate.Lock()
+	defer gate.Unlock()
+
 	var wsName, status, dbRuntime string
 	var tier int
 	err := db.DB.QueryRowContext(ctx,
@@ -0,0 +1,215 @@
+package handlers
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// resetRestartProvisionGateFor clears the per-workspace provision-gate
+// mutex for the given workspaceID. Tests must call this between
+// scenarios because restartProvisionGates is a package-level sync.Map
+// shared across every restart path (manual HTTP + programmatic
+// RestartByID).
+func resetRestartProvisionGateFor(workspaceID string) {
+	restartProvisionGates.Delete(workspaceID)
+}
+
+// TestRestartProvisionGate_SingleCallRunsOneCycle is the baseline:
+// no contention, one cycle. If this fails the gate infrastructure is
+// broken at the simplest path.
+func TestRestartProvisionGate_SingleCallRunsOneCycle(t *testing.T) {
+	const wsID = "test-provgate-single"
+	resetRestartProvisionGateFor(wsID)
+
+	gate := acquireRestartProvisionGate(wsID)
+	var calls atomic.Int32
+	gate.Lock()
+	calls.Add(1)
+	gate.Unlock()
+
+	if got := calls.Load(); got != 1 {
+		t.Errorf("expected 1 cycle, got %d", got)
+	}
+}
+
+// TestRestartProvisionGate_ConcurrentAcquiresSerialize verifies the
+// load-bearing property: a second goroutine calling Lock() while the
+// first holds the gate must block until the first releases. This is
+// the "only ONE Docker create for ws-<id> at a time" invariant — the
+// second caller's Stop+Start is fully serialized behind the first's.
+func TestRestartProvisionGate_ConcurrentAcquiresSerialize(t *testing.T) {
+	const wsID = "test-provgate-serialize"
+	resetRestartProvisionGateFor(wsID)
+
+	gate := acquireRestartProvisionGate(wsID)
+
+	// Goroutine 1 holds the gate, then signals "ready", then waits
+	// for the test harness to release it. Goroutine 2 tries to Lock
+	// while goroutine 1 holds it. If the gate is doing its job,
+	// goroutine 2's Lock() returns only AFTER goroutine 1 unlocks.
+	holder1Ready := make(chan struct{})
+	holder1CanProceed := make(chan struct{})
+	holder2AcquiredAt := make(chan time.Time, 1)
+
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	// Goroutine 1: holds the gate, signals ready, waits.
+	go func() {
+		defer wg.Done()
+		gate.Lock()
+		close(holder1Ready)
+		<-holder1CanProceed
+		gate.Unlock()
+	}()
+
+	// Goroutine 2: tries to Lock while goroutine 1 holds. Records
+	// when it actually acquires the lock.
+	go func() {
+		defer wg.Done()
+		<-holder1Ready
+		gate.Lock()
+		holder2AcquiredAt <- time.Now()
+		gate.Unlock()
+	}()
+
+	// Wait for goroutine 1 to have the lock, then sleep a beat so
+	// goroutine 2's Lock() has definitely had a chance to block.
+	<-holder1Ready
+	time.Sleep(50 * time.Millisecond)
+
+	// Release goroutine 1. Goroutine 2 should now acquire and
+	// timestamp itself.
+	releaseAt := time.Now()
+	close(holder1CanProceed)
+
+	// Goroutine 2's acquisition must happen at or after releaseAt
+	// (the gate is FIFO for non-starved callers, so a strict >=
+	// suffices; the sleep ensures we're not just measuring jitter).
+	select {
+	case got := <-holder2AcquiredAt:
+		if got.Before(releaseAt) {
+			t.Errorf("goroutine 2 acquired the gate BEFORE release: got=%v release=%v", got, releaseAt)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("goroutine 2 never acquired the gate (deadlock)")
+	}
+
+	wg.Wait()
+}
+
+// TestRestartProvisionGate_ConcurrentCyclesSerializeOnGate is the
+// end-to-end invariant test for the bug we're closing: when the
+// manual HTTP Restart path and the programmatic RestartByID path
+// (or two distinct programmatic callers) BOTH try to enter their
+// Stop+Start cycle simultaneously, the gate serializes them so
+// the second caller's provisioner.Start runs only AFTER the first's
+// is fully done. Mirrors the production call shape — both paths
+// acquire the same per-workspace gate, then run their cycle inside
+// the Lock/Unlock.
+//
+// In production this is the contract that prevents the "two
+// provisioner.Start calls for the same ws-<id> → Docker name
+// conflict → markProvisionFailed" symptom that wedged the
+// workspace to "failed" in #2659 run 353677/job 478450.
+func TestRestartProvisionGate_ConcurrentCyclesSerializeOnGate(t *testing.T) {
+	const wsID = "test-provgate-concurrent-cycles"
+	resetRestartProvisionGateFor(wsID)
+
+	gate := acquireRestartProvisionGate(wsID)
+
+	// Each "cycle" is a closure that: (1) acquires the gate, (2)
+	// increments the shared counter, (3) records its start time
+	// and increment index, (4) sleeps briefly so the OTHER cycle has
+	// a chance to interleave (it WON'T, if the gate is doing its job),
+	// (5) records its end time, (6) releases the gate.
+	var (
+		cycleStarts atomic.Int32
+		cycleEnds   atomic.Int32
+	)
+
+	cycle := func() {
+		gate.Lock()
+		defer gate.Unlock()
+		idx := cycleStarts.Add(1)
+		_ = idx
+		// Hold long enough that any interleaving cycle would have
+		// also entered its critical section if the gate were broken.
+		time.Sleep(20 * time.Millisecond)
+		cycleEnds.Add(1)
+	}
+
+	// Fire two cycles concurrently via separate goroutines. If the
+	// gate is doing its job, cycleEnds == cycleStarts at the end
+	// (each cycle fully exits before the next enters). If the gate
+	// is broken (or missing), two cycles would overlap in the
+	// critical section and cycleEnds would lag cycleStarts.
+	var wg sync.WaitGroup
+	for i := 0; i < 2; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			cycle()
+		}()
+	}
+	wg.Wait()
+
+	// Invariant: starts and ends must match. If they don't, the
+	// critical sections overlapped — which would mean the gate
+	// failed to serialize them.
+	if got := cycleStarts.Load(); got != 2 {
+		t.Errorf("expected 2 cycle starts, got %d", got)
+	}
+	if got := cycleEnds.Load(); got != 2 {
+		t.Errorf("expected 2 cycle ends (matching starts), got %d — cycles overlapped inside the gate's critical section", got)
+	}
+}
+
+// TestRestartProvisionGate_TwoWorkspacesIndependent verifies the
+// gate is per-workspace: holding the gate for ws-A does NOT block
+// acquisition of the gate for ws-B. (If the implementation
+// accidentally used a single global mutex, this would deadlock.)
+func TestRestartProvisionGate_TwoWorkspacesIndependent(t *testing.T) {
+	const wsA = "test-provgate-ws-a"
+	const wsB = "test-provgate-ws-b"
+	resetRestartProvisionGateFor(wsA)
+	resetRestartProvisionGateFor(wsB)
+	t.Cleanup(func() {
+		resetRestartProvisionGateFor(wsA)
+		resetRestartProvisionGateFor(wsB)
+	})
+
+	gateA := acquireRestartProvisionGate(wsA)
+	gateA.Lock()
+	defer gateA.Unlock()
+
+	// If the implementation accidentally used a single global
+	// mutex, this Lock() would block forever waiting for the
+	// release of the defer above. Use a short timeout via a
+	// select-equivalent (goroutine + channel).
+	type result struct {
+		ok bool
+	}
+	done := make(chan result, 1)
+	go func() {
+		gateB := acquireRestartProvisionGate(wsB)
+		gateB.Lock()
+		// Hold briefly to prove the critical section was actually acquired; this
+		// also satisfies staticcheck SA2001 (empty critical section) because the
+		// Lock/Unlock pair encloses a real operation.
+		time.Sleep(1 * time.Millisecond)
+		gateB.Unlock()
+		done <- result{ok: true}
+	}()
+
+	select {
+	case r := <-done:
+		if !r.ok {
+			t.Error("ws-B gate Lock/Unlock returned !ok")
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("ws-B gate acquisition blocked while ws-A was held — gates are not per-workspace")
+	}
+}