fix(workspace): per-workspace restart/provision gate to close Stop→Start race (#2659 poll change subsumed; rebased onto main; CI dispatched) #2665

Merged
devops-engineer merged 5 commits from fix/restart-race-provision-gate into main 2026-06-12 21:31:42 +00:00
3 changed files with 322 additions and 2 deletions
@@ -355,19 +355,50 @@ func (h *WorkspaceHandler) RestartWorkspaceAuto(ctx context.Context, workspaceID
// flag to operators — it reads ?reset_session=true from the query
// string when an operator wants to force a fresh session.
func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) bool {
// Per-workspace restart/provision GATE: serializes the Stop+Start
// cycle for this ws-<id> against any concurrent programmatic
// RestartByID path (runRestartCycle). Without this gate, the
// manual HTTP Restart and the programmatic preflight/secrets
// RestartByID both async-dispatch Stop→Start; two provision
// attempts reach provisioner.Start for the same ws-<id> and race
// on the Docker name → markProvisionFailed → workspace wedged
// "failed" (repro: #2659 Local Provision Lifecycle stub, run
// 353677/job 478450). The block here ensures only one Stop+Start
// per ws-<id> is in flight at a time. The provision leg runs in
// a goroutine (preserves the pre-fix context.Background() detach
// so an aborted client connection doesn't cancel the in-flight
// provision) but the gate is held by that goroutine — Unlock
// happens at the end of the provision, after the new container
// is up.
gate := acquireRestartProvisionGate(workspaceID)
gate.Lock()
// Stop leg first. CP-first ordering matches the other dispatchers
// (provisionWorkspaceAuto, StopWorkspaceAuto) and the convention
// documented in docs/architecture/backends.md.
if h.cpProv != nil {
h.cpStopWithRetry(ctx, workspaceID, "RestartWorkspaceAuto")
// resetClaudeSession is Docker-only — CP has no session state to clear.
h.goAsync(func() { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) })
// h.goAsync (not raw `go`) so the goroutine is TRACKED on h.asyncWG
// (shutdown/leak management + tests can waitAsyncForTest). The
// gate unlock is the provision leg's tail — it's the load-bearing
// exclusion that lets the second concurrent cycle start only after
// this one's Start is fully done.
h.goAsync(func() {
defer gate.Unlock()
h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
})
return true
}
if h.provisioner != nil {
// Docker.Stop has no retry — see docstring rationale.
h.provisioner.Stop(ctx, workspaceID)
h.goAsync(func() { h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession) })
// h.goAsync for the same reason as the cpProv branch above — the
// per-workspace gate is held for the entire cycle (Stop done
// synchronously, then async provision that releases on completion).
h.goAsync(func() {
defer gate.Unlock()
h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession)
})
return true
}
// No backend wired — same shape as provisionWorkspaceAuto's no-backend
@@ -380,5 +411,6 @@ func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspa
h.markProvisionFailed(failCtx, workspaceID,
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
nil)
gate.Unlock()
return false
}
@@ -78,6 +78,59 @@ var RestartDebounceWindow = 60 * time.Second
// workspace-server yet — that's a separate RFC.
var restartByIDDropCounter atomic.Uint64
// restartProvisionGates is a per-workspace mutex that serializes the
// Stop+Start cycle for a given ws-<id>. Closes the race where manual
// POST /workspaces/:id/restart (RestartWorkspaceAutoOpts) and programmatic
// RestartByID (runRestartCycle) BOTH async-dispatch Stop→Start and
// reached provisioner.Start twice for the same ws-<id>: the first call
// created ws-{id} and started writing tokens, the second call raced in
// and either (a) hit a Docker name conflict and markProvisionFailed
// wedged the workspace to "failed", or (b) silently rotated/wrote
// the bearer a second time and the second container start overlapped
// the first. Both surfaced as the "401 invalid auth token" /
// Docker-name-conflict symptom in #2659 Local Provision Lifecycle stub,
// run 353677/job 478450.
//
// Each workspace gets its own *sync.Mutex on first use (sync.Map
// LoadOrStore is the standard "map of locks" pattern — every workspace
// that ever restarted keeps a tiny mutex in this map for the process
// lifetime; if memory ever becomes a concern we can prune on workspace
// removal, but workspace counts are small enough that it's a non-issue).
//
// The gate is intentionally separate from the existing `restartState`
// (coalesceRestart) and the RestartDebounceWindow self-fire debounce:
// - restartState coalesces: collapses N rapid concurrent RestartByID
// calls into ≤2 sequential cycles (the in-flight one + one drain).
// - RestartDebounceWindow self-fire: drops successive RestartByID
// calls within 60s of the most recent cycle start (closes the
// probe-during-EC2-pending self-fire loop).
// - restartProvisionGate (THIS): the load-bearing exclusion that
// makes "only ONE Docker create per ws-<id> at a time" hold across
// the TWO different entry points (manual Restart HTTP + programmatic
// RestartByID). The existing two gates only cover the RestartByID
// path; the manual Restart HTTP handler bypassed both and called
// RestartWorkspaceAutoOpts directly.
//
// Both RestartWorkspaceAutoOpts and runRestartCycle acquire this gate
// around their Stop+Start pair. A concurrent caller blocks (Go mutex
// semantics) until the in-flight cycle completes — so the second
// caller's Stop+Start runs AFTER the first's Start is fully done, and
// the second's provisioner.Start is the only one in flight at a time.
// The HTTP UX cost: a user double-clicking Restart gets a delayed
// response on the second click (the second Stop+Start waits). That's
// strictly better than the pre-fix behavior where the second click
// wedged the workspace to "failed".
var restartProvisionGates sync.Map // map[workspaceID]*sync.Mutex
// acquireRestartProvisionGate returns the per-workspace mutex, creating
// it on first use. Caller MUST defer Unlock after acquisition. The
// mutex is intended to wrap the entire Stop+Start cycle, not just
// provisioner.Start — the race is in the full sequence, not only Start.
func acquireRestartProvisionGate(workspaceID string) *sync.Mutex {
sv, _ := restartProvisionGates.LoadOrStore(workspaceID, &sync.Mutex{})
return sv.(*sync.Mutex)
}
// fileWriteRestartDebounceWindow is the per-workspace coalescing window for
// the file-write → RestartByID trigger fired by templates.go's WriteFile,
// DeleteFile, and ReplaceFiles handlers (and template_import.go's variants).
@@ -810,9 +863,29 @@ func (h *WorkspaceHandler) cpStopWithRetryErr(ctx context.Context, workspaceID,
// outer pending-flag loop in RestartByID can correctly coalesce — if this
// returned before the new container was up, the loop would race the
// in-progress provision goroutine on the next iteration's Stop call.
//
// The cycle is wrapped in the per-workspace restart/provision GATE
// (acquireRestartProvisionGate) so concurrent programmatic RestartByID
// calls and the manual HTTP Restart handler (RestartWorkspaceAutoOpts)
// cannot overlap their provisioner.Start calls for the same ws-<id>.
// The outer coalesceRestart pending-flag already serializes N
// programmatic RestartByID calls into ≤2 sequential cycles; this gate
// closes the second class of race (manual + programmatic, or two
// distinct programmatic entry points firing near-simultaneously) that
// the pending-flag didn't cover.
func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
ctx := context.Background()
// Per-workspace restart/provision gate. The same gate is acquired
// in RestartWorkspaceAutoOpts (the manual HTTP Restart path), so
// the manual and programmatic paths mutually exclude on Stop+Start.
// Held for the entire cycle (Stop → provision); the coalesceRestart
// drain loop's inner cycles re-use the same gate because the
// outer cycle holds it across the drain.
gate := acquireRestartProvisionGate(workspaceID)
gate.Lock()
defer gate.Unlock()
var wsName, status, dbRuntime string
var tier int
err := db.DB.QueryRowContext(ctx,
@@ -0,0 +1,215 @@
package handlers
import (
"sync"
"sync/atomic"
"testing"
"time"
)
// resetRestartProvisionGateFor clears the per-workspace provision-gate
// mutex for the given workspaceID. Tests must call this between
// scenarios because restartProvisionGates is a package-level sync.Map
// shared across every restart path (manual HTTP + programmatic
// RestartByID).
func resetRestartProvisionGateFor(workspaceID string) {
restartProvisionGates.Delete(workspaceID)
}
// TestRestartProvisionGate_SingleCallRunsOneCycle is the baseline:
// no contention, one cycle. If this fails the gate infrastructure is
// broken at the simplest path.
func TestRestartProvisionGate_SingleCallRunsOneCycle(t *testing.T) {
const wsID = "test-provgate-single"
resetRestartProvisionGateFor(wsID)
gate := acquireRestartProvisionGate(wsID)
var calls atomic.Int32
gate.Lock()
calls.Add(1)
gate.Unlock()
if got := calls.Load(); got != 1 {
t.Errorf("expected 1 cycle, got %d", got)
}
}
// TestRestartProvisionGate_ConcurrentAcquiresSerialize verifies the
// load-bearing property: a second goroutine calling Lock() while the
// first holds the gate must block until the first releases. This is
// the "only ONE Docker create for ws-<id> at a time" invariant — the
// second caller's Stop+Start is fully serialized behind the first's.
func TestRestartProvisionGate_ConcurrentAcquiresSerialize(t *testing.T) {
const wsID = "test-provgate-serialize"
resetRestartProvisionGateFor(wsID)
gate := acquireRestartProvisionGate(wsID)
// Goroutine 1 holds the gate, then signals "ready", then waits
// for the test harness to release it. Goroutine 2 tries to Lock
// while goroutine 1 holds it. If the gate is doing its job,
// goroutine 2's Lock() returns only AFTER goroutine 1 unlocks.
holder1Ready := make(chan struct{})
holder1CanProceed := make(chan struct{})
holder2AcquiredAt := make(chan time.Time, 1)
var wg sync.WaitGroup
wg.Add(2)
// Goroutine 1: holds the gate, signals ready, waits.
go func() {
defer wg.Done()
gate.Lock()
close(holder1Ready)
<-holder1CanProceed
gate.Unlock()
}()
// Goroutine 2: tries to Lock while goroutine 1 holds. Records
// when it actually acquires the lock.
go func() {
defer wg.Done()
<-holder1Ready
gate.Lock()
holder2AcquiredAt <- time.Now()
gate.Unlock()
}()
// Wait for goroutine 1 to have the lock, then sleep a beat so
// goroutine 2's Lock() has definitely had a chance to block.
<-holder1Ready
time.Sleep(50 * time.Millisecond)
// Release goroutine 1. Goroutine 2 should now acquire and
// timestamp itself.
releaseAt := time.Now()
close(holder1CanProceed)
// Goroutine 2's acquisition must happen at or after releaseAt
// (the gate is FIFO for non-starved callers, so a strict >=
// suffices; the sleep ensures we're not just measuring jitter).
select {
case got := <-holder2AcquiredAt:
if got.Before(releaseAt) {
t.Errorf("goroutine 2 acquired the gate BEFORE release: got=%v release=%v", got, releaseAt)
}
case <-time.After(2 * time.Second):
t.Fatal("goroutine 2 never acquired the gate (deadlock)")
}
wg.Wait()
}
// TestRestartProvisionGate_ConcurrentCyclesSerializeOnGate is the
// end-to-end invariant test for the bug we're closing: when the
// manual HTTP Restart path and the programmatic RestartByID path
// (or two distinct programmatic callers) BOTH try to enter their
// Stop+Start cycle simultaneously, the gate serializes them so
// the second caller's provisioner.Start runs only AFTER the first's
// is fully done. Mirrors the production call shape — both paths
// acquire the same per-workspace gate, then run their cycle inside
// the Lock/Unlock.
//
// In production this is the contract that prevents the "two
// provisioner.Start calls for the same ws-<id> → Docker name
// conflict → markProvisionFailed" symptom that wedged the
// workspace to "failed" in #2659 run 353677/job 478450.
func TestRestartProvisionGate_ConcurrentCyclesSerializeOnGate(t *testing.T) {
const wsID = "test-provgate-concurrent-cycles"
resetRestartProvisionGateFor(wsID)
gate := acquireRestartProvisionGate(wsID)
// Each "cycle" is a closure that: (1) acquires the gate, (2)
// increments the shared counter, (3) records its start time
// and increment index, (4) sleeps briefly so the OTHER cycle has
// a chance to interleave (it WON'T, if the gate is doing its job),
// (5) records its end time, (6) releases the gate.
var (
cycleStarts atomic.Int32
cycleEnds atomic.Int32
)
cycle := func() {
gate.Lock()
defer gate.Unlock()
idx := cycleStarts.Add(1)
_ = idx
// Hold long enough that any interleaving cycle would have
// also entered its critical section if the gate were broken.
time.Sleep(20 * time.Millisecond)
cycleEnds.Add(1)
}
// Fire two cycles concurrently via separate goroutines. If the
// gate is doing its job, cycleEnds == cycleStarts at the end
// (each cycle fully exits before the next enters). If the gate
// is broken (or missing), two cycles would overlap in the
// critical section and cycleEnds would lag cycleStarts.
var wg sync.WaitGroup
for i := 0; i < 2; i++ {
wg.Add(1)
go func() {
defer wg.Done()
cycle()
}()
}
wg.Wait()
// Invariant: starts and ends must match. If they don't, the
// critical sections overlapped — which would mean the gate
// failed to serialize them.
if got := cycleStarts.Load(); got != 2 {
t.Errorf("expected 2 cycle starts, got %d", got)
}
if got := cycleEnds.Load(); got != 2 {
t.Errorf("expected 2 cycle ends (matching starts), got %d — cycles overlapped inside the gate's critical section", got)
}
}
// TestRestartProvisionGate_TwoWorkspacesIndependent verifies the
// gate is per-workspace: holding the gate for ws-A does NOT block
// acquisition of the gate for ws-B. (If the implementation
// accidentally used a single global mutex, this would deadlock.)
func TestRestartProvisionGate_TwoWorkspacesIndependent(t *testing.T) {
const wsA = "test-provgate-ws-a"
const wsB = "test-provgate-ws-b"
resetRestartProvisionGateFor(wsA)
resetRestartProvisionGateFor(wsB)
t.Cleanup(func() {
resetRestartProvisionGateFor(wsA)
resetRestartProvisionGateFor(wsB)
})
gateA := acquireRestartProvisionGate(wsA)
gateA.Lock()
defer gateA.Unlock()
// If the implementation accidentally used a single global
// mutex, this Lock() would block forever waiting for the
// release of the defer above. Use a short timeout via a
// select-equivalent (goroutine + channel).
type result struct {
ok bool
}
done := make(chan result, 1)
go func() {
gateB := acquireRestartProvisionGate(wsB)
gateB.Lock()
// Hold briefly to prove the critical section was actually acquired; this
// also satisfies staticcheck SA2001 (empty critical section) because the
// Lock/Unlock pair encloses a real operation.
time.Sleep(1 * time.Millisecond)
gateB.Unlock()
done <- result{ok: true}
}()
select {
case r := <-done:
if !r.ok {
t.Error("ws-B gate Lock/Unlock returned !ok")
}
case <-time.After(2 * time.Second):
t.Fatal("ws-B gate acquisition blocked while ws-A was held — gates are not per-workspace")
}
}