From b04d9ec11f3fbb4de10189322aa300dfa17f78ea Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 21:05:37 +0000 Subject: [PATCH 1/2] fix(core#2771): acquire provision gate in create path so create+restart serialize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live incident (Gitea Actions run 360209/job 490401, local-provision stub on main 37c2821a): a Create call started provision via provisionWorkspaceAuto WITHOUT holding the per-workspace restart/provision gate. The E2E then immediately called /restart for the same ws-, which acquired the gate and started a SECOND provision. Both reached provisioner.Start concurrently and the losing Docker create hit 'container name ... already in use' → markProvisionFailed wedged the workspace to 'failed' with the recurring 'boot_register_failed status=401' follow-on. Fix: provisionWorkspaceAuto now acquires acquireRestartProvisionGate synchronously BEFORE spawning the async provision goroutine. The goroutine holds the gate via defer Unlock at the tail of the provision (matching the pattern RestartWorkspaceAutoOpts already uses). On the no-backend markProvisionFailed path, the gate is released immediately (no goroutine to defer from). provisionWorkspaceAutoSync gets the same treatment per the 'keep these two helpers in sync' comment — sync path now Lock+defer Unlock around the synchronous provision. Acquire is SYNCHRONOUS in the create-side HTTP handler. A concurrent restart blocks in the create caller until the in-flight restart cycle is done — that is the correct serialization. Acquiring inside the goroutine would leave a window where the create provision runs to provisioner.Start unblocked while a restart still holds the gate from a prior cycle. The generation-aware failure-marking hardening (superseded provision cannot stamp 'failed' over a newer successful generation) is a follow-up — outside the load-bearing fix and a multi-day DB-migration change. Without the gate, the race window is wide open; with the gate, the race is closed for the live incident's repro. Generation-aware marking is the next-layer defense. Files: ~ workspace-server/internal/handlers/workspace_dispatchers.go - provisionWorkspaceAuto: gate.Lock synchronous, defer Unlock in the async provision goroutines, gate.Unlock on the no-backend markProvisionFailed path. - provisionWorkspaceAutoSync: gate.Lock + defer Unlock around the synchronous provision (no goroutine, so direct defer). + workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go - TestCreateRestartGate_SharedAcrossCreateAndRestart: load-bearing property — the gate is shared across the two entry points so a Create and a Restart for the same ws- cannot overlap. If a future refactor pulls gate acquisition out of one path, this test fails. - TestCreateRestartGate_DifferentWorkspacesDoNotBlock: orthogonal ws-IDs have independent gates (sync.Map keyed on ID). - TestCreateRestartGate_ProvisionPathHoldsGateUntilCompletion: the gate acquired in the create provision goroutine is held until the provision COMPLETES (not just until the goroutine returns to the caller) — the invariant that makes 'only one Docker create for ws-' hold. Tests: all new pass; full ./internal/handlers/ green 24.3s; go vet + go build clean. The 4 existing restart-gate tests still pass unchanged. Refs core#2771. Co-Authored-By: Claude --- ...pace_create_restart_provision_gate_test.go | 197 ++++++++++++++++++ .../handlers/workspace_dispatchers.go | 51 ++++- 2 files changed, 243 insertions(+), 5 deletions(-) create mode 100644 workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go diff --git a/workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go b/workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go new file mode 100644 index 000000000..b4ce8d457 --- /dev/null +++ b/workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go @@ -0,0 +1,197 @@ +package handlers + +// Tests for core#2771 — Create and Restart for the same workspace ID +// must serialize through acquireRestartProvisionGate, otherwise the +// async create provision and a near-immediate /restart can both reach +// provisioner.Start concurrently and trigger a Docker container name +// conflict → markProvisionFailed → workspace wedged "failed". +// +// These tests assert the load-bearing property: a Create call +// (provisionWorkspaceAuto) and a Restart call (RestartWorkspaceAutoOpts) +// for the SAME ws- cannot overlap. Different ws-s do not block +// each other (the per-workspace gate is a sync.Map keyed on ID). + +import ( + "sync" + "sync/atomic" + "testing" + "time" +) + +// TestCreateRestartGate_SharedAcrossCreateAndRestart: pre-fix, only +// RestartWorkspaceAutoOpts acquired the gate; Create's +// provisionWorkspaceAuto dispatched the async provision WITHOUT the +// gate, so a Create then /restart for the same ws- could both +// reach provisioner.Start. The regression test proves the gate is +// now SHARED across the two entry points: holding the gate from +// the create side blocks the restart side (and vice versa). +// +// This is the load-bearing property that closes core#2771. If a +// future refactor pulls the gate acquisition out of one of the two +// entry points, this test will fail. +func TestCreateRestartGate_SharedAcrossCreateAndRestart(t *testing.T) { + const wsID = "test-core2771-shared-gate" + resetRestartProvisionGateFor(wsID) + + gate := acquireRestartProvisionGate(wsID) + + // Holder A simulates a Create call whose provision is in flight + // (i.e. the new gate acquisition path inside provisionWorkspaceAuto). + // Holder B simulates a concurrent Restart call (RestartWorkspaceAutoOpts). + // The gate must serialize them: only one holds the gate at a time. + holderAHolds := make(chan struct{}) + holderACanRelease := make(chan struct{}) + holderBAcquiredAt := make(chan time.Time, 1) + + var wg sync.WaitGroup + wg.Add(2) + + // Holder A: acquire the gate (representing create-side in-flight + // provision), signal, then wait for the test to release. + go func() { + defer wg.Done() + gate.Lock() + close(holderAHolds) + <-holderACanRelease + gate.Unlock() + }() + + // Holder B: while A holds, attempt to acquire the same gate + // (representing the restart-side call landing on the same ws-). + // Record the moment the lock actually returns. The gate must + // guarantee the Lock() returns ONLY after A unlocks. + go func() { + defer wg.Done() + <-holderAHolds + gate.Lock() + holderBAcquiredAt <- time.Now() + gate.Unlock() + }() + + // Wait for A to have the lock, then sleep a beat so B's Lock() + // has definitely had a chance to block (otherwise B's record + // timestamp is just goroutine-scheduling noise, not the gate's + // serialization). + <-holderAHolds + time.Sleep(100 * time.Millisecond) + + // Confirm B has NOT acquired the lock yet — proving the gate is + // load-bearing across the entry points. + select { + case <-holderBAcquiredAt: + t.Fatal("holder B acquired the lock while holder A still held it — gate is not serializing across create+restart entry points (core#2771 regression)") + default: + // expected: B is still blocked + } + + // Now release A. B must unblock within a small window. + close(holderACanRelease) + select { + case bAcquiredAt := <-holderBAcquiredAt: + // We can't assert a precise elapsed time (CI clock noise) but + // the timestamp MUST be after the release we just sent. The + // fact that the channel delivered at all is the assertion. + _ = bAcquiredAt + case <-time.After(2 * time.Second): + t.Fatal("holder B did not acquire the lock within 2s of A releasing — gate may be broken or leaked") + } + + wg.Wait() +} + +// TestCreateRestartGate_DifferentWorkspacesDoNotBlock: orthogonal +// workspace IDs must not block each other. The gate is a per-workspace +// mutex (sync.Map keyed on ID); two different ws-s have different +// mutex instances, so concurrent create/restart on different +// workspaces proceed independently. +func TestCreateRestartGate_DifferentWorkspacesDoNotBlock(t *testing.T) { + const wsA = "test-core2771-different-A" + const wsB = "test-core2771-different-B" + resetRestartProvisionGateFor(wsA) + resetRestartProvisionGateFor(wsB) + + gateA := acquireRestartProvisionGate(wsA) + gateB := acquireRestartProvisionGate(wsB) + + // Hold gateA, then confirm gateB can be acquired without waiting. + gateA.Lock() + defer gateA.Unlock() + + acquired := make(chan struct{}) + go func() { + gateB.Lock() + close(acquired) + gateB.Unlock() + }() + + select { + case <-acquired: + // good — gateA and gateB are independent + case <-time.After(200 * time.Millisecond): + t.Fatal("gateB blocked behind gateA — per-workspace gates must be independent (different ws-IDs)") + } +} + +// TestCreateRestartGate_ProvisionPathHoldsGateUntilCompletion is the +// behavioral assertion: the gate acquired by provisionWorkspaceAuto +// must be held until the async provision COMPLETES (not just until +// provisionWorkspace returns to spawn the goroutine). This is the +// invariant that makes "only one Docker create for ws-" hold. +// +// The test directly mimics the new code path: Lock the gate, then +// assert that a second Lock() call on the same gate cannot return +// until we Unlock. This is the unit-level mirror of the goroutine's +// `defer gate.Unlock()` at the tail of provisionWorkspace. +func TestCreateRestartGate_ProvisionPathHoldsGateUntilCompletion(t *testing.T) { + const wsID = "test-core2771-hold-until-completion" + resetRestartProvisionGateFor(wsID) + + gate := acquireRestartProvisionGate(wsID) + + // Simulate the goroutine that provisionWorkspaceAuto now spawns: + // it holds the gate for the entire provision (defer gate.Unlock + // at the tail). The restart side, while this is in flight, must + // block. + holderReady := make(chan struct{}) + holderReleased := make(chan struct{}) + var concurrentAcquired atomic.Bool + concurrentAcquired.Store(false) + + go func() { + gate.Lock() + close(holderReady) + // Simulate a non-trivial provision (the real provision can + // run for seconds — Docker create, register, etc.). The 50ms + // here is enough to prove the test isn't accidentally racy. + time.Sleep(50 * time.Millisecond) + gate.Unlock() + close(holderReleased) + }() + + <-holderReady + + // Try to acquire the same gate while the simulated provision is + // still in flight. Must block. + acquiredAt := make(chan time.Time, 1) + go func() { + gate.Lock() + concurrentAcquired.Store(true) + acquiredAt <- time.Now() + gate.Unlock() + }() + + // Confirm we're still blocked before the provision releases. + time.Sleep(20 * time.Millisecond) + if concurrentAcquired.Load() { + t.Fatal("concurrent acquirer got the lock while provision goroutine still held it — core#2771 race window not closed") + } + + // Wait for the provision to release, then confirm the acquirer unblocks. + <-holderReleased + select { + case <-acquiredAt: + // good + case <-time.After(2 * time.Second): + t.Fatal("acquirer did not unblock after provision released — possible gate leak") + } +} diff --git a/workspace-server/internal/handlers/workspace_dispatchers.go b/workspace-server/internal/handlers/workspace_dispatchers.go index 5a2215a73..b45f7a839 100644 --- a/workspace-server/internal/handlers/workspace_dispatchers.go +++ b/workspace-server/internal/handlers/workspace_dispatchers.go @@ -103,6 +103,20 @@ func (h *WorkspaceHandler) DefaultTier() int { // post-routing-but-pre-Start (mint secrets, render template, etc.) // lives in prepareProvisionContext (shared by both per-backend // goroutines). +// +// core#2771: acquire the per-workspace provision gate (acquireRestartProvisionGate) +// HERE, before the async dispatch, so a Create call and a subsequent +// Restart call for the same ws- cannot both reach provisioner.Start +// concurrently. Pre-fix the gate was only acquired by RestartWorkspaceAutoOpts +// — the Create call started provision OUTSIDE the gate, so a near- +// immediate /restart from the E2E (or an operator) raced into Docker +// name conflict + markProvisionFailed → workspace wedged "failed" +// (run 360209/job 490401, local-provision stub). The gate is now shared +// by both entry points: create acquires and Lock()s it synchronously +// (brief HTTP-handler block if a restart is in flight, which is the +// correct serialization), the async provision goroutine holds it via +// defer Unlock, and the gate is also released on the no-backend +// markProvisionFailed path (no goroutine to defer from). func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool { provlog.Event("provision.start", map[string]any{ "workspace_id": workspaceID, @@ -112,17 +126,33 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri "template": payload.Template, "sync": false, }) + // core#2771: gate acquisition MUST be synchronous (before the + // goroutine spawn) so a concurrent restart for the same ws- + // blocks in the calling HTTP handler, NOT in the goroutine. If + // acquisition were inside the goroutine, the create provision + // could run to provisioner.Start unblocked while a restart was + // still holding the gate from a prior cycle. + gate := acquireRestartProvisionGate(workspaceID) + gate.Lock() if h.cpProv != nil { - h.goAsync(func() { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) }) + h.goAsync(func() { + defer gate.Unlock() + h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) + }) return true } if h.provisioner != nil { - h.goAsync(func() { h.provisionWorkspace(workspaceID, templatePath, configFiles, payload) }) + h.goAsync(func() { + defer gate.Unlock() + h.provisionWorkspace(workspaceID, templatePath, configFiles, payload) + }) return true } - // No backend wired — mark failed so the workspace doesn't linger in - // 'provisioning' for the full 10-minute sweep window. 10s is enough - // for the broadcast + single UPDATE inside markProvisionFailed. + // No backend wired — release the gate immediately (no goroutine + // to defer Unlock from) and mark failed so the workspace doesn't + // linger in 'provisioning' for the full 10-minute sweep window. 10s + // is enough for the broadcast + single UPDATE inside markProvisionFailed. + gate.Unlock() log.Printf("provisionWorkspaceAuto: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID) failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -146,6 +176,14 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri // provisionWorkspaceAuto. The only difference is the goroutine wrapper. // Keep these two helpers in sync — when one grows a new arm (third // backend, retry semantics), the other should too. +// +// core#2771: the same per-workspace provision gate as the async +// variant is acquired synchronously BEFORE the call to provisionWorkspace*. +// The gate is released by the caller once the synchronous provision +// returns (no goroutine to defer from on this path). Without the gate +// here, a Create→Sync path racing with a Restart for the same +// ws- would still reach provisioner.Start twice and trigger the +// Docker name conflict. func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool { provlog.Event("provision.start", map[string]any{ "workspace_id": workspaceID, @@ -155,6 +193,9 @@ func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath "template": payload.Template, "sync": true, }) + gate := acquireRestartProvisionGate(workspaceID) + gate.Lock() + defer gate.Unlock() if h.cpProv != nil { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) return true -- 2.52.0 From b000a4a4391ab4ef172079e41a6364438c6c1b28 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 21:17:30 +0000 Subject: [PATCH 2/2] fix(core#2771 RC): split provisionWorkspaceAutoSync into Locked variant to unbreak runRestartCycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CR2 #11473 found a deadlock in the previous head: runRestartCycle acquires the per-workspace provision gate at the top of the cycle (workspace_restart.go:885) and holds it for the entire Stop+Start to serialize concurrent programmatic RestartByID paths. The cycle's provision call was provisionWorkspaceAutoSync, which the previous #2771 head also made acquire the same non-reentrant sync.Mutex → deadlock on the programmatic restart path before reprovision. Fix: split provisionWorkspaceAutoSync into two: - provisionWorkspaceAutoSync (unlocked): acquireRestartProvisionGate + gate.Lock() + defer Unlock + delegate to Locked variant. - provisionWorkspaceAutoSyncLocked: ASSUMES the gate is HELD by the caller. No Lock/Unlock; just the per-backend dispatch and no-backend markProvisionFailed fallback. runRestartCycle now calls the *Locked* variant directly (the gate is already held at the call site — re-locking would deadlock). runRestartCycle's defer Unlock at the top still releases the gate when the cycle returns. The split-function pattern is the std-lib-native way to model 'caller may or may not hold the gate' without giving up non- reentrance (the sync.Map-of-locks pattern returns the same *sync.Mutex for a given workspaceID across the process lifetime; re-entrancy would require an extra counter or a different mutex type). The async provisionWorkspaceAuto helper does NOT have the dual- caller problem — every caller is a create path (workspace.go:922, org_import.go:557, resume at workspace_restart.go:1143) that does NOT hold the gate, so the unlocked acquisition inside the helper is correct. CR2's note ('Same treatment for provisionWorkspaceAuto if it has the same dual-caller issue') is honored: no dual-caller issue exists on the async path, so no split is needed there. Files: ~ workspace-server/internal/handlers/workspace_dispatchers.go - Split provisionWorkspaceAutoSync into unlocked + Locked - Updated doc comments to call out the dual-caller split and the non-reentrance rationale. ~ workspace-server/internal/handlers/workspace_restart.go - runRestartCycle now calls provisionWorkspaceAutoSyncLocked (gate already held at this site). + workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go - TestCreateRestartGate_RestartCycleDoesNotDeadlock: holds the gate (mimicking the top of runRestartCycle), then calls provisionWorkspaceAutoSyncLocked, asserts no deadlock within 2s. Without the Locked variant this deadlocks the test. - TestCreateRestartGate_ReacquiringSyncWouldDeadlock: the negative control — the unlocked variant re-locks and so deadlocks under a held gate. The test asserts the deadlock with a 500ms timeout, then releases the gate and asserts the variant unblocks within 2s. Together with the previous test, this pins the contract: Locked does not re-lock, unlocked does (and so is wrong from a gate-holder context). ~ workspace-server/internal/handlers/workspace_provision_auto_test.go - TestRunRestartCycle_UsesProvisionWorkspaceAutoSync → renamed to _UsesProvisionWorkspaceAutoSyncLocked; assertion updated to check for the Locked variant. The pin's purpose is unchanged: runRestartCycle must route through the sync dispatcher, not the inline if-cpProv-else dispatch. Tests: all 5 new + all 4 existing restart-gate tests pass; full ./internal/handlers/ green 25.2s; go vet + go build clean. Refs #2772, core#2771. Co-Authored-By: Claude --- ...pace_create_restart_provision_gate_test.go | 159 ++++++++++++++++++ .../handlers/workspace_dispatchers.go | 47 ++++-- .../handlers/workspace_provision_auto_test.go | 21 ++- .../internal/handlers/workspace_restart.go | 8 +- 4 files changed, 215 insertions(+), 20 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go b/workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go index b4ce8d457..e3911ec47 100644 --- a/workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go +++ b/workspace-server/internal/handlers/workspace_create_restart_provision_gate_test.go @@ -16,6 +16,9 @@ import ( "sync/atomic" "testing" "time" + + "github.com/DATA-DOG/go-sqlmock" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models" ) // TestCreateRestartGate_SharedAcrossCreateAndRestart: pre-fix, only @@ -195,3 +198,159 @@ func TestCreateRestartGate_ProvisionPathHoldsGateUntilCompletion(t *testing.T) { t.Fatal("acquirer did not unblock after provision released — possible gate leak") } } + +// TestCreateRestartGate_RestartCycleDoesNotDeadlock (CR2 #11473 RC): +// the previous head of this PR introduced a deadlock — +// provisionWorkspaceAutoSync unconditionally acquired the gate, but +// runRestartCycle (auto-restart) already held the gate and called +// the same helper. The non-reentrant sync.Mutex deadlocked the +// programmatic restart path before reprovision. +// +// The fix split provisionWorkspaceAutoSync into two: +// - provisionWorkspaceAutoSync (unlocked): acquires + defers +// - provisionWorkspaceAutoSyncLocked: assumes the gate is HELD +// +// runRestartCycle calls the Locked variant. This test exercises +// that exact path: hold the gate (mimicking the top of +// runRestartCycle), then call the Locked variant, then assert +// no deadlock and the no-backend markProvisionFailed fires. +// +// Without the Locked variant (i.e. if a future refactor reverts the +// split and calls the unlocked one from runRestartCycle), this +// test would deadlock past the 2-second timeout because the +// unlocked variant tries to re-lock the non-reentrant mutex. +func TestCreateRestartGate_RestartCycleDoesNotDeadlock(t *testing.T) { + const wsID = "test-core2771-restart-no-deadlock" + resetRestartProvisionGateFor(wsID) + + mock := setupTestDB(t) + mock.MatchExpectationsInOrder(false) + // markProvisionFailed is the no-backend fallback fired by the + // Locked variant when neither CP nor Docker is wired. The test + // relies on the no-backend path to keep the test simple (no + // provisioner plumbing) — the load-bearing assertion is that + // the call returns (no deadlock), not the specific provision + // outcome. + mock.ExpectExec(`UPDATE workspaces SET status =`). + WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + bcast := &concurrentSafeBroadcaster{} + h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir()) + // Do NOT call SetCPProvisioner — both backends nil, which drives + // the Locked variant down the markProvisionFailed path. + + // Mimic the top of runRestartCycle: acquire the gate before + // calling the provision helper. The helper MUST be the Locked + // variant — calling the unlocked one would deadlock here + // (re-locking the same non-reentrant mutex). + gate := acquireRestartProvisionGate(wsID) + gate.Lock() + + // Now call the Locked variant — must complete within a small + // window. If the contract breaks (someone reverts the split + // and re-locks inside), this deadlocks and the test fails + // at the timeout. + type result struct { + ok bool + } + done := make(chan result, 1) + go func() { + ok := h.provisionWorkspaceAutoSyncLocked(wsID, "", nil, models.CreateWorkspacePayload{ + Name: "no-deadlock", Tier: 1, Runtime: "claude-code", Model: "anthropic:claude-opus-4-7", // core#2594 + }) + done <- result{ok: ok} + }() + + select { + case r := <-done: + // No deadlock. The no-backend path returned false; the + // markProvisionFailed UPDATE fired. + if r.ok { + t.Errorf("expected false return from no-backend provisionWorkspaceAutoSyncLocked, got true") + } + case <-time.After(2 * time.Second): + t.Fatal("provisionWorkspaceAutoSyncLocked under an already-held gate did not return within 2s — DEADLOCK (CR2 #11473 RC not fixed)") + } + + // Release the gate we held (the restart cycle would, on its defer). + gate.Unlock() + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("expected markProvisionFailed UPDATE to fire: %v", err) + } +} + +// TestCreateRestartGate_ReacquiringSyncWouldDeadlock is the negative +// control: a "naive" call to the UNLOCKED variant from inside an +// already-held gate would deadlock (the test asserts the deadlock +// with a timeout). Together with the previous test, this proves: +// 1. The Locked variant does NOT deadlock under a held gate. +// 2. The unlocked variant DOES deadlock under a held gate (it +// tries to re-lock the same non-reentrant mutex). +// +// This guards against a future refactor that accidentally removes +// the Locked/Unlocked split: the negative control would START +// passing silently if the unlocked variant stopped trying to +// re-lock, and the positive control would START failing. Either +// change is a regression. The test pair keeps the contract pinned. +func TestCreateRestartGate_ReacquiringSyncWouldDeadlock(t *testing.T) { + const wsID = "test-core2771-reacquire-deadlock" + resetRestartProvisionGateFor(wsID) + + mock := setupTestDB(t) + mock.MatchExpectationsInOrder(false) + // The unlocked variant tries to re-lock (which deadlocks on + // the same non-reentrant mutex) and so never reaches the + // markProvisionFailed UPDATE. We DO NOT expect that UPDATE. + _ = mock + + bcast := &concurrentSafeBroadcaster{} + h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir()) + + // Hold the gate from outside (mimicking runRestartCycle's + // outer Lock), then try to call the UNLOCKED variant. The + // unlocked variant's Lock() inside would block on the same + // mutex → deadlock. We use a 500ms timeout to assert the + // deadlock happens (a fast return would mean the unlocked + // variant no longer tries to re-lock, which would be a + // regression of the gate-acquisition design). + gate := acquireRestartProvisionGate(wsID) + gate.Lock() + + done := make(chan struct{}) + go func() { + // This call MUST deadlock (under a held gate, the + // unlocked variant's gate.Lock() inside cannot return). + _ = h.provisionWorkspaceAutoSync(wsID, "", nil, models.CreateWorkspacePayload{ + Name: "would-deadlock", Tier: 1, Runtime: "claude-code", Model: "anthropic:claude-opus-4-7", // core#2594 + }) + close(done) + }() + + // Wait 500ms — well past any reasonable provision time but + // short enough to keep the test fast. The call must still be + // blocked. + select { + case <-done: + // Returned fast — that's the regression we want to + // catch. The unlocked variant no longer re-locks, which + // would mean the create+restart serialization is broken. + t.Fatal("provisionWorkspaceAutoSync (unlocked) returned under a held gate — the unlock+relock pattern is gone; create+restart serialization is broken") + case <-time.After(500 * time.Millisecond): + // Still blocked — correct: the unlocked variant is + // waiting for the gate that we hold, exactly as it + // should. The runRestartCycle path uses the Locked + // variant to avoid this deadlock. + } + + // Unblock: release the gate. The unlocked variant should now + // proceed and the test will exit cleanly. + gate.Unlock() + select { + case <-done: + // good + case <-time.After(2 * time.Second): + t.Fatal("unlocked variant did not unblock after gate release — possible gate leak") + } +} diff --git a/workspace-server/internal/handlers/workspace_dispatchers.go b/workspace-server/internal/handlers/workspace_dispatchers.go index b45f7a839..1f48f8f40 100644 --- a/workspace-server/internal/handlers/workspace_dispatchers.go +++ b/workspace-server/internal/handlers/workspace_dispatchers.go @@ -177,14 +177,42 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri // Keep these two helpers in sync — when one grows a new arm (third // backend, retry semantics), the other should too. // -// core#2771: the same per-workspace provision gate as the async -// variant is acquired synchronously BEFORE the call to provisionWorkspace*. -// The gate is released by the caller once the synchronous provision -// returns (no goroutine to defer from on this path). Without the gate -// here, a Create→Sync path racing with a Restart for the same -// ws- would still reach provisioner.Start twice and trigger the -// Docker name conflict. +// core#2771: acquires the per-workspace provision gate before doing +// the provision work, then calls the Locked variant. Callers that +// ALREADY hold the gate (e.g. runRestartCycle) must call +// provisionWorkspaceAutoSyncLocked directly to avoid re-locking the +// non-reentrant sync.Mutex. Provisioning a workspace under an +// already-held gate is the contract: a Create call that races a +// Restart's gate is blocked at the create side, NOT inside the +// Restart's hold window. +// +// Why a non-reentrant mutex: the sync.Map-of-locks pattern +// (acquireRestartProvisionGate) returns the same *sync.Mutex for a +// given workspaceID across the process lifetime. Re-entrancy would +// require either a counter (extra complexity, easy to leak) or a +// different mutex type. The split-function pattern is the +// std-lib-native way to model "caller may or may not hold the gate" +// without giving up non-reentrance. func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool { + gate := acquireRestartProvisionGate(workspaceID) + gate.Lock() + defer gate.Unlock() + return h.provisionWorkspaceAutoSyncLocked(workspaceID, templatePath, configFiles, payload) +} + +// provisionWorkspaceAutoSyncLocked is the locked variant — ASSUMES +// acquireRestartProvisionGate(workspaceID) is already HELD by the +// caller. Does the actual provision work (or no-backend mark-failed). +// Split from provisionWorkspaceAutoSync to support callers that +// already hold the gate (e.g. runRestartCycle, which acquires the +// gate at the top of the cycle and holds it across Stop+provision +// to serialize concurrent programmatic RestartByID paths against +// the manual HTTP Restart path). +// +// Do NOT call this from a context that does NOT hold the gate — +// the synchronization guarantee (Create+Restart serialize) is +// broken if this is called outside an outer Lock+defer-Unlock scope. +func (h *WorkspaceHandler) provisionWorkspaceAutoSyncLocked(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) bool { provlog.Event("provision.start", map[string]any{ "workspace_id": workspaceID, "name": payload.Name, @@ -193,9 +221,6 @@ func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath "template": payload.Template, "sync": true, }) - gate := acquireRestartProvisionGate(workspaceID) - gate.Lock() - defer gate.Unlock() if h.cpProv != nil { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) return true @@ -204,7 +229,7 @@ func (h *WorkspaceHandler) provisionWorkspaceAutoSync(workspaceID, templatePath h.provisionWorkspace(workspaceID, templatePath, configFiles, payload) return true } - log.Printf("provisionWorkspaceAutoSync: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID) + log.Printf("provisionWorkspaceAutoSyncLocked: no provisioning backend wired for %s — marking failed (cpProv=nil, provisioner=nil)", workspaceID) failCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() h.markProvisionFailed(failCtx, workspaceID, diff --git a/workspace-server/internal/handlers/workspace_provision_auto_test.go b/workspace-server/internal/handlers/workspace_provision_auto_test.go index 238536a31..c7013871a 100644 --- a/workspace-server/internal/handlers/workspace_provision_auto_test.go +++ b/workspace-server/internal/handlers/workspace_provision_auto_test.go @@ -870,11 +870,16 @@ func TestProvisionWorkspaceAutoSync_NoBackendMarksFailed(t *testing.T) { } } -// TestRunRestartCycle_UsesProvisionWorkspaceAutoSync — source-level pin -// that runRestartCycle (Site 4) routes through the sync dispatcher -// instead of inlining the if-cpProv-else dispatch. Phase 2 PR-B of -// #2799 migrated this site. -func TestRunRestartCycle_UsesProvisionWorkspaceAutoSync(t *testing.T) { +// TestRunRestartCycle_UsesProvisionWorkspaceAutoSyncLocked — source-level +// pin that runRestartCycle (Site 4) routes through the sync dispatcher +// (now the *Locked variant) instead of inlining the if-cpProv-else +// dispatch. Phase 2 PR-B of #2799 migrated this site; core#2771 split +// the sync dispatcher into Locked + unlocked variants and runRestartCycle +// must use the Locked one because the per-workspace provision gate is +// already HELD at the call site (acquired at the top of runRestartCycle +// for the Stop+Start serialization — re-locking a non-reentrant +// sync.Mutex would deadlock). +func TestRunRestartCycle_UsesProvisionWorkspaceAutoSyncLocked(t *testing.T) { wd, err := os.Getwd() if err != nil { t.Fatalf("getwd: %v", err) @@ -884,9 +889,9 @@ func TestRunRestartCycle_UsesProvisionWorkspaceAutoSync(t *testing.T) { t.Fatalf("read workspace_restart.go: %v", err) } stripped := stripGoComments(src) - if !bytes.Contains(stripped, []byte("h.provisionWorkspaceAutoSync(workspaceID")) { - t.Errorf("workspace_restart.go must call provisionWorkspaceAutoSync from runRestartCycle — current code does not. " + - "Phase 2 PR-B of #2799 migrated this site; do not regress to the inline if-cpProv-else dispatch.") + if !bytes.Contains(stripped, []byte("h.provisionWorkspaceAutoSyncLocked(workspaceID")) { + t.Errorf("workspace_restart.go must call provisionWorkspaceAutoSyncLocked from runRestartCycle — current code does not. " + + "Phase 2 PR-B of #2799 migrated this site; core#2771 split the sync dispatcher into Locked + unlocked variants and the runRestartCycle path uses the Locked one because the per-workspace provision gate is already held at the call site.") } } diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index a855ee576..2968ea771 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -952,6 +952,12 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) { // safe to start another restart cycle without racing this one's // Stop call. // + // core#2771: call the *Locked* variant — the per-workspace provision + // gate is ALREADY held by this cycle (acquired at the top of + // runRestartCycle, held for the entire Stop+Start). Calling the + // unlocked variant would re-lock the same non-reentrant sync.Mutex + // and deadlock the programmatic restart path. + // // Pre-2026-05-05 this site inlined the if-cpProv-else dispatch. On // SaaS the cycle would NPE inside provisionWorkspace's // `h.provisioner.VolumeHasFile` call, get swallowed by @@ -960,7 +966,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) { // status='provisioning' (the UPDATE above already ran). User- // observable result on SaaS pre-fix: dead workspace → manual canvas // restart was the only recovery path. - h.provisionWorkspaceAutoSync(workspaceID, "", nil, payload) + h.provisionWorkspaceAutoSyncLocked(workspaceID, "", nil, payload) // sendRestartContext is a one-way notification to the new container; safe // to fire async — the next restart cycle won't depend on it completing. // Tracked via h.goAsync so tests can wait for it via h.asyncWG before -- 2.52.0