2026-06-13 21:26:46 +00:00
4 changed files with 448 additions and 15 deletions
@@ -0,0 +1,356 @@
+package handlers
+
+// Tests for core#2771 — Create and Restart for the same workspace ID
+// must serialize through acquireRestartProvisionGate, otherwise the
+// async create provision and a near-immediate /restart can both reach
+// provisioner.Start concurrently and trigger a Docker container name
+// conflict → markProvisionFailed → workspace wedged "failed".
+//
+// These tests assert the load-bearing property: a Create call
+// (provisionWorkspaceAuto) and a Restart call (RestartWorkspaceAutoOpts)
+// for the SAME ws-<id> cannot overlap. Different ws-<id>s do not block
+// each other (the per-workspace gate is a sync.Map keyed on ID).
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
+)
+
+// TestCreateRestartGate_SharedAcrossCreateAndRestart: pre-fix, only
+// RestartWorkspaceAutoOpts acquired the gate; Create's
+// provisionWorkspaceAuto dispatched the async provision WITHOUT the
+// gate, so a Create then /restart for the same ws-<id> could both
+// reach provisioner.Start. The regression test proves the gate is
+// now SHARED across the two entry points: holding the gate from
+// the create side blocks the restart side (and vice versa).
+//
+// This is the load-bearing property that closes core#2771. If a
+// future refactor pulls the gate acquisition out of one of the two
+// entry points, this test will fail.
+func TestCreateRestartGate_SharedAcrossCreateAndRestart(t *testing.T) {
+	const wsID = "test-core2771-shared-gate"
+	resetRestartProvisionGateFor(wsID)
+
+	gate := acquireRestartProvisionGate(wsID)
+
+	// Holder A simulates a Create call whose provision is in flight
+	// (i.e. the new gate acquisition path inside provisionWorkspaceAuto).
+	// Holder B simulates a concurrent Restart call (RestartWorkspaceAutoOpts).
+	// The gate must serialize them: only one holds the gate at a time.
+	holderAHolds := make(chan struct{})
+	holderACanRelease := make(chan struct{})
+	holderBAcquiredAt := make(chan time.Time, 1)
+
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	// Holder A: acquire the gate (representing create-side in-flight
+	// provision), signal, then wait for the test to release.
+	go func() {
+		defer wg.Done()
+		gate.Lock()
+		close(holderAHolds)
+		<-holderACanRelease
+		gate.Unlock()
+	}()
+
+	// Holder B: while A holds, attempt to acquire the same gate
+	// (representing the restart-side call landing on the same ws-<id>).
+	// Record the moment the lock actually returns. The gate must
+	// guarantee the Lock() returns ONLY after A unlocks.
+	go func() {
+		defer wg.Done()
+		<-holderAHolds
+		gate.Lock()
+		holderBAcquiredAt <- time.Now()
+		gate.Unlock()
+	}()
+
+	// Wait for A to have the lock, then sleep a beat so B's Lock()
+	// has definitely had a chance to block (otherwise B's record
+	// timestamp is just goroutine-scheduling noise, not the gate's
+	// serialization).
+	<-holderAHolds
+	time.Sleep(100 * time.Millisecond)
+
+	// Confirm B has NOT acquired the lock yet — proving the gate is
+	// load-bearing across the entry points.
+	select {
+	case <-holderBAcquiredAt:
+		t.Fatal("holder B acquired the lock while holder A still held it — gate is not serializing across create+restart entry points (core#2771 regression)")
+	default:
+		// expected: B is still blocked
+	}
+
+	// Now release A. B must unblock within a small window.
+	close(holderACanRelease)
+	select {
+	case bAcquiredAt := <-holderBAcquiredAt:
+		// We can't assert a precise elapsed time (CI clock noise) but
+		// the timestamp MUST be after the release we just sent. The
+		// fact that the channel delivered at all is the assertion.
+		_ = bAcquiredAt
+	case <-time.After(2 * time.Second):
+		t.Fatal("holder B did not acquire the lock within 2s of A releasing — gate may be broken or leaked")
+	}
+
+	wg.Wait()
+}
+
+// TestCreateRestartGate_DifferentWorkspacesDoNotBlock: orthogonal
+// workspace IDs must not block each other. The gate is a per-workspace
+// mutex (sync.Map keyed on ID); two different ws-<id>s have different
+// mutex instances, so concurrent create/restart on different
+// workspaces proceed independently.
+func TestCreateRestartGate_DifferentWorkspacesDoNotBlock(t *testing.T) {
+	const wsA = "test-core2771-different-A"
+	const wsB = "test-core2771-different-B"
+	resetRestartProvisionGateFor(wsA)
+	resetRestartProvisionGateFor(wsB)
+
+	gateA := acquireRestartProvisionGate(wsA)
+	gateB := acquireRestartProvisionGate(wsB)
+
+	// Hold gateA, then confirm gateB can be acquired without waiting.
+	gateA.Lock()
+	defer gateA.Unlock()
+
+	acquired := make(chan struct{})
+	go func() {
+		gateB.Lock()
+		close(acquired)
+		gateB.Unlock()
+	}()
+
+	select {
+	case <-acquired:
+		// good — gateA and gateB are independent
+	case <-time.After(200 * time.Millisecond):
+		t.Fatal("gateB blocked behind gateA — per-workspace gates must be independent (different ws-IDs)")
+	}
+}
+
+// TestCreateRestartGate_ProvisionPathHoldsGateUntilCompletion is the
+// behavioral assertion: the gate acquired by provisionWorkspaceAuto
+// must be held until the async provision COMPLETES (not just until
+// provisionWorkspace returns to spawn the goroutine). This is the
+// invariant that makes "only one Docker create for ws-<id>" hold.
+//
+// The test directly mimics the new code path: Lock the gate, then
+// assert that a second Lock() call on the same gate cannot return
+// until we Unlock. This is the unit-level mirror of the goroutine's
+// `defer gate.Unlock()` at the tail of provisionWorkspace.
+func TestCreateRestartGate_ProvisionPathHoldsGateUntilCompletion(t *testing.T) {
+	const wsID = "test-core2771-hold-until-completion"
+	resetRestartProvisionGateFor(wsID)
+
+	gate := acquireRestartProvisionGate(wsID)
+
+	// Simulate the goroutine that provisionWorkspaceAuto now spawns:
+	// it holds the gate for the entire provision (defer gate.Unlock
+	// at the tail). The restart side, while this is in flight, must
+	// block.
+	holderReady := make(chan struct{})
+	holderReleased := make(chan struct{})
+	var concurrentAcquired atomic.Bool
+	concurrentAcquired.Store(false)
+
+	go func() {
+		gate.Lock()
+		close(holderReady)
+		// Simulate a non-trivial provision (the real provision can
+		// run for seconds — Docker create, register, etc.). The 50ms
+		// here is enough to prove the test isn't accidentally racy.
+		time.Sleep(50 * time.Millisecond)
+		gate.Unlock()
+		close(holderReleased)
+	}()
+
+	<-holderReady
+
+	// Try to acquire the same gate while the simulated provision is
+	// still in flight. Must block.
+	acquiredAt := make(chan time.Time, 1)
+	go func() {
+		gate.Lock()
+		concurrentAcquired.Store(true)
+		acquiredAt <- time.Now()
+		gate.Unlock()
+	}()
+
+	// Confirm we're still blocked before the provision releases.
+	time.Sleep(20 * time.Millisecond)
+	if concurrentAcquired.Load() {
+		t.Fatal("concurrent acquirer got the lock while provision goroutine still held it — core#2771 race window not closed")
+	}
+
+	// Wait for the provision to release, then confirm the acquirer unblocks.
+	<-holderReleased
+	select {
+	case <-acquiredAt:
+		// good
+	case <-time.After(2 * time.Second):
+		t.Fatal("acquirer did not unblock after provision released — possible gate leak")
+	}
+}
+
+// TestCreateRestartGate_RestartCycleDoesNotDeadlock (CR2 #11473 RC):
+// the previous head of this PR introduced a deadlock —
+// provisionWorkspaceAutoSync unconditionally acquired the gate, but
+// runRestartCycle (auto-restart) already held the gate and called
+// the same helper. The non-reentrant sync.Mutex deadlocked the
+// programmatic restart path before reprovision.
+//
+// The fix split provisionWorkspaceAutoSync into two:
+//   - provisionWorkspaceAutoSync (unlocked): acquires + defers
+//   - provisionWorkspaceAutoSyncLocked: assumes the gate is HELD
+//
+// runRestartCycle calls the Locked variant. This test exercises
+// that exact path: hold the gate (mimicking the top of
+// runRestartCycle), then call the Locked variant, then assert
+// no deadlock and the no-backend markProvisionFailed fires.
+//
+// Without the Locked variant (i.e. if a future refactor reverts the
+// split and calls the unlocked one from runRestartCycle), this
+// test would deadlock past the 2-second timeout because the
+// unlocked variant tries to re-lock the non-reentrant mutex.
+func TestCreateRestartGate_RestartCycleDoesNotDeadlock(t *testing.T) {
+	const wsID = "test-core2771-restart-no-deadlock"
+	resetRestartProvisionGateFor(wsID)
+
+	mock := setupTestDB(t)
+	mock.MatchExpectationsInOrder(false)
+	// markProvisionFailed is the no-backend fallback fired by the
+	// Locked variant when neither CP nor Docker is wired. The test
+	// relies on the no-backend path to keep the test simple (no
+	// provisioner plumbing) — the load-bearing assertion is that
+	// the call returns (no deadlock), not the specific provision
+	// outcome.
+	mock.ExpectExec(`UPDATE workspaces SET status =`).
+		WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	bcast := &concurrentSafeBroadcaster{}
+	h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
+	// Do NOT call SetCPProvisioner — both backends nil, which drives
+	// the Locked variant down the markProvisionFailed path.
+
+	// Mimic the top of runRestartCycle: acquire the gate before
+	// calling the provision helper. The helper MUST be the Locked
+	// variant — calling the unlocked one would deadlock here
+	// (re-locking the same non-reentrant mutex).
+	gate := acquireRestartProvisionGate(wsID)
+	gate.Lock()
+
+	// Now call the Locked variant — must complete within a small
+	// window. If the contract breaks (someone reverts the split
+	// and re-locks inside), this deadlocks and the test fails
+	// at the timeout.
+	type result struct {
+		ok bool
+	}
+	done := make(chan result, 1)
+	go func() {
+		ok := h.provisionWorkspaceAutoSyncLocked(wsID, "", nil, models.CreateWorkspacePayload{
+			Name: "no-deadlock", Tier: 1, Runtime: "claude-code", Model: "anthropic:claude-opus-4-7", // core#2594
+		})
+		done <- result{ok: ok}
+	}()
+
+	select {
+	case r := <-done:
+		// No deadlock. The no-backend path returned false; the
+		// markProvisionFailed UPDATE fired.
+		if r.ok {
+			t.Errorf("expected false return from no-backend provisionWorkspaceAutoSyncLocked, got true")
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatal("provisionWorkspaceAutoSyncLocked under an already-held gate did not return within 2s — DEADLOCK (CR2 #11473 RC not fixed)")
+	}
+
+	// Release the gate we held (the restart cycle would, on its defer).
+	gate.Unlock()
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("expected markProvisionFailed UPDATE to fire: %v", err)
+	}
+}
+
+// TestCreateRestartGate_ReacquiringSyncWouldDeadlock is the negative
+// control: a "naive" call to the UNLOCKED variant from inside an
+// already-held gate would deadlock (the test asserts the deadlock
+// with a timeout). Together with the previous test, this proves:
+//   1. The Locked variant does NOT deadlock under a held gate.
+//   2. The unlocked variant DOES deadlock under a held gate (it
+//      tries to re-lock the same non-reentrant mutex).
+//
+// This guards against a future refactor that accidentally removes
+// the Locked/Unlocked split: the negative control would START
+// passing silently if the unlocked variant stopped trying to
+// re-lock, and the positive control would START failing. Either
+// change is a regression. The test pair keeps the contract pinned.
+func TestCreateRestartGate_ReacquiringSyncWouldDeadlock(t *testing.T) {
+	const wsID = "test-core2771-reacquire-deadlock"
+	resetRestartProvisionGateFor(wsID)
+
+	mock := setupTestDB(t)
+	mock.MatchExpectationsInOrder(false)
+	// The unlocked variant tries to re-lock (which deadlocks on
+	// the same non-reentrant mutex) and so never reaches the
+	// markProvisionFailed UPDATE. We DO NOT expect that UPDATE.
+	_ = mock
+
+	bcast := &concurrentSafeBroadcaster{}
+	h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
+
+	// Hold the gate from outside (mimicking runRestartCycle's
+	// outer Lock), then try to call the UNLOCKED variant. The
+	// unlocked variant's Lock() inside would block on the same
+	// mutex → deadlock. We use a 500ms timeout to assert the
+	// deadlock happens (a fast return would mean the unlocked
+	// variant no longer tries to re-lock, which would be a
+	// regression of the gate-acquisition design).
+	gate := acquireRestartProvisionGate(wsID)
+	gate.Lock()
+
+	done := make(chan struct{})
+	go func() {
+		// This call MUST deadlock (under a held gate, the
+		// unlocked variant's gate.Lock() inside cannot return).
+		_ = h.provisionWorkspaceAutoSync(wsID, "", nil, models.CreateWorkspacePayload{
+			Name: "would-deadlock", Tier: 1, Runtime: "claude-code", Model: "anthropic:claude-opus-4-7", // core#2594
+		})
+		close(done)
+	}()
+
+	// Wait 500ms — well past any reasonable provision time but
+	// short enough to keep the test fast. The call must still be
+	// blocked.
+	select {
+	case <-done:
+		// Returned fast — that's the regression we want to
+		// catch. The unlocked variant no longer re-locks, which
+		// would mean the create+restart serialization is broken.
+		t.Fatal("provisionWorkspaceAutoSync (unlocked) returned under a held gate — the unlock+relock pattern is gone; create+restart serialization is broken")
+	case <-time.After(500 * time.Millisecond):
+		// Still blocked — correct: the unlocked variant is
+		// waiting for the gate that we hold, exactly as it
+		// should. The runRestartCycle path uses the Locked
+		// variant to avoid this deadlock.
+	}
+
+	// Unblock: release the gate. The unlocked variant should now
+	// proceed and the test will exit cleanly.
+	gate.Unlock()
+	select {
+	case <-done:
+		// good
+	case <-time.After(2 * time.Second):
+		t.Fatal("unlocked variant did not unblock after gate release — possible gate leak")
+	}
+}
@@ -103,6 +103,20 @@ func (h *WorkspaceHandler) DefaultTier() int {
 // post-routing-but-pre-Start (mint secrets, render template, etc.)
 // lives in prepareProvisionContext (shared by both per-backend
 // goroutines).
+//
+// core#2771: acquire the per-workspace provision gate (acquireRestartProvisionGate)
+// HERE, before the async dispatch, so a Create call and a subsequent
+// Restart call for the same ws-<id> cannot both reach provisioner.Start
+// concurrently. Pre-fix the gate was only acquired by RestartWorkspaceAutoOpts
+// — the Create call started provision OUTSIDE the gate, so a near-
+// immediate /restart from the E2E (or an operator) raced into Docker
+// name conflict + markProvisionFailed → workspace wedged "failed"
+// (run 360209/job 490401, local-provision stub). The gate is now shared
+// by both entry points: create acquires and Lock()s it synchronously
+// (brief HTTP-handler block if a restart is in flight, which is the
+// correct serialization), the async provision goroutine holds it via
+// defer Unlock, and the gate is also released on the no-backend
+// markProvisionFailed path (no goroutine to defer from).
 func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
 	provlog.Event("provision.start", map[string]any{
 		"workspace_id": workspaceID,
@@ -112,17 +126,33 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri
 		"template":     payload.Template,
 		"sync":         false,
 	})
+	// core#2771: gate acquisition MUST be synchronous (before the
+	// goroutine spawn) so a concurrent restart for the same ws-<id>
+	// blocks in the calling HTTP handler, NOT in the goroutine. If
+	// acquisition were inside the goroutine, the create provision
+	// could run to provisioner.Start unblocked while a restart was
+	// still holding the gate from a prior cycle.
+	gate := acquireRestartProvisionGate(workspaceID)
+	gate.Lock()
 	if h.cpProv != nil {
-		h.goAsync(func() { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) })
+		h.goAsync(func() {
+			defer gate.Unlock()
+			h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
+		})
 		return true
 	}
 	if h.provisioner != nil {
-		h.goAsync(func() { h.provisionWorkspace(workspaceID, templatePath, configFiles, payload) })
+		h.goAsync(func() {
+			defer gate.Unlock()
+			h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
+		})
 		return true
 	}
-	// No backend wired — mark failed so the workspace doesn't linger in
-	// 'provisioning' for the full 10-minute sweep window. 10s is enough
-	// for the broadcast + single UPDATE inside markProvisionFailed.
+	// No backend wired — release the gate immediately (no goroutine
+	// to defer Unlock from) and mark failed so the workspace doesn't
+	// linger in 'provisioning' for the full 10-minute sweep window. 10s
+	// is enough for the broadcast + single UPDATE inside markProvisionFailed.
+	gate.Unlock()
 	log.Printf("provisionWorkspaceAuto: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
 	failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
@@ -146,7 +176,43 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri
 // provisionWorkspaceAuto. The only difference is the goroutine wrapper.
 // Keep these two helpers in sync — when one grows a new arm (third
 // backend, retry semantics), the other should too.
+//
+// core#2771: acquires the per-workspace provision gate before doing
+// the provision work, then calls the Locked variant. Callers that
+// ALREADY hold the gate (e.g. runRestartCycle) must call
+// provisionWorkspaceAutoSyncLocked directly to avoid re-locking the
+// non-reentrant sync.Mutex. Provisioning a workspace under an
+// already-held gate is the contract: a Create call that races a
+// Restart's gate is blocked at the create side, NOT inside the
+// Restart's hold window.
+//
+// Why a non-reentrant mutex: the sync.Map-of-locks pattern
+// (acquireRestartProvisionGate) returns the same *sync.Mutex for a
+// given workspaceID across the process lifetime. Re-entrancy would
+// require either a counter (extra complexity, easy to leak) or a
+// different mutex type. The split-function pattern is the
+// std-lib-native way to model "caller may or may not hold the gate"
+// without giving up non-reentrance.
 func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
+	gate := acquireRestartProvisionGate(workspaceID)
+	gate.Lock()
+	defer gate.Unlock()
+	return h.provisionWorkspaceAutoSyncLocked(workspaceID, templatePath, configFiles, payload)
+}
+
+// provisionWorkspaceAutoSyncLocked is the locked variant — ASSUMES
+// acquireRestartProvisionGate(workspaceID) is already HELD by the
+// caller. Does the actual provision work (or no-backend mark-failed).
+// Split from provisionWorkspaceAutoSync to support callers that
+// already hold the gate (e.g. runRestartCycle, which acquires the
+// gate at the top of the cycle and holds it across Stop+provision
+// to serialize concurrent programmatic RestartByID paths against
+// the manual HTTP Restart path).
+//
+// Do NOT call this from a context that does NOT hold the gate —
+// the synchronization guarantee (Create+Restart serialize) is
+// broken if this is called outside an outer Lock+defer-Unlock scope.
+func (h *WorkspaceHandler) provisionWorkspaceAutoSyncLocked(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool {
 	provlog.Event("provision.start", map[string]any{
 		"workspace_id": workspaceID,
 		"name":         payload.Name,
@@ -163,7 +229,7 @@ func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath
 		h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
 		return true
 	}
-	log.Printf("provisionWorkspaceAutoSync: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
+	log.Printf("provisionWorkspaceAutoSyncLocked: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID)
 	failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
 	h.markProvisionFailed(failCtx, workspaceID,
@@ -870,11 +870,16 @@ func TestProvisionWorkspaceAutoSync_NoBackendMarksFailed(t *testing.T) {
 	}
 }

-// TestRunRestartCycle_UsesProvisionWorkspaceAutoSync — source-level pin
-// that runRestartCycle (Site 4) routes through the sync dispatcher
-// instead of inlining the if-cpProv-else dispatch. Phase 2 PR-B of
-// #2799 migrated this site.
-func TestRunRestartCycle_UsesProvisionWorkspaceAutoSync(t *testing.T) {
+// TestRunRestartCycle_UsesProvisionWorkspaceAutoSyncLocked — source-level
+// pin that runRestartCycle (Site 4) routes through the sync dispatcher
+// (now the *Locked variant) instead of inlining the if-cpProv-else
+// dispatch. Phase 2 PR-B of #2799 migrated this site; core#2771 split
+// the sync dispatcher into Locked + unlocked variants and runRestartCycle
+// must use the Locked one because the per-workspace provision gate is
+// already HELD at the call site (acquired at the top of runRestartCycle
+// for the Stop+Start serialization — re-locking a non-reentrant
+// sync.Mutex would deadlock).
+func TestRunRestartCycle_UsesProvisionWorkspaceAutoSyncLocked(t *testing.T) {
 	wd, err := os.Getwd()
 	if err != nil {
 		t.Fatalf("getwd: %v", err)
@@ -884,9 +889,9 @@ func TestRunRestartCycle_UsesProvisionWorkspaceAutoSync(t *testing.T) {
 		t.Fatalf("read workspace_restart.go: %v", err)
 	}
 	stripped := stripGoComments(src)
-	if !bytes.Contains(stripped, []byte("h.provisionWorkspaceAutoSync(workspaceID")) {
-		t.Errorf("workspace_restart.go must call provisionWorkspaceAutoSync from runRestartCycle — current code does not. " +
-			"Phase 2 PR-B of #2799 migrated this site; do not regress to the inline if-cpProv-else dispatch.")
+	if !bytes.Contains(stripped, []byte("h.provisionWorkspaceAutoSyncLocked(workspaceID")) {
+		t.Errorf("workspace_restart.go must call provisionWorkspaceAutoSyncLocked from runRestartCycle — current code does not. " +
+			"Phase 2 PR-B of #2799 migrated this site; core#2771 split the sync dispatcher into Locked + unlocked variants and the runRestartCycle path uses the Locked one because the per-workspace provision gate is already held at the call site.")
 	}
 }

@@ -952,6 +952,12 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	// safe to start another restart cycle without racing this one's
 	// Stop call.
 	//
+	// core#2771: call the *Locked* variant — the per-workspace provision
+	// gate is ALREADY held by this cycle (acquired at the top of
+	// runRestartCycle, held for the entire Stop+Start). Calling the
+	// unlocked variant would re-lock the same non-reentrant sync.Mutex
+	// and deadlock the programmatic restart path.
+	//
 	// Pre-2026-05-05 this site inlined the if-cpProv-else dispatch. On
 	// SaaS the cycle would NPE inside provisionWorkspace's
 	// `h.provisioner.VolumeHasFile` call, get swallowed by
@@ -960,7 +966,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	// status='provisioning' (the UPDATE above already ran). User-
 	// observable result on SaaS pre-fix: dead workspace → manual canvas
 	// restart was the only recovery path.
-	h.provisionWorkspaceAutoSync(workspaceID, "", nil, payload)
+	h.provisionWorkspaceAutoSyncLocked(workspaceID, "", nil, payload)
 	// sendRestartContext is a one-way notification to the new container; safe
 	// to fire async — the next restart cycle won't depend on it completing.
 	// Tracked via h.goAsync so tests can wait for it via h.asyncWG before