fix(workspace): per-workspace restart/provision gate to close Stop→Start race (#2659 poll change subsumed; rebased onto main; CI dispatched) #2665
@@ -355,19 +355,50 @@ func (h *WorkspaceHandler) RestartWorkspaceAuto(ctx context.Context, workspaceID
|
||||
// flag to operators — it reads ?reset_session=true from the query
|
||||
// string when an operator wants to force a fresh session.
|
||||
func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) bool {
|
||||
// Per-workspace restart/provision GATE: serializes the Stop+Start
|
||||
// cycle for this ws-<id> against any concurrent programmatic
|
||||
// RestartByID path (runRestartCycle). Without this gate, the
|
||||
// manual HTTP Restart and the programmatic preflight/secrets
|
||||
// RestartByID both async-dispatch Stop→Start; two provision
|
||||
// attempts reach provisioner.Start for the same ws-<id> and race
|
||||
// on the Docker name → markProvisionFailed → workspace wedged
|
||||
// "failed" (repro: #2659 Local Provision Lifecycle stub, run
|
||||
// 353677/job 478450). The block here ensures only one Stop+Start
|
||||
// per ws-<id> is in flight at a time. The provision leg runs in
|
||||
// a goroutine (preserves the pre-fix context.Background() detach
|
||||
// so an aborted client connection doesn't cancel the in-flight
|
||||
// provision) but the gate is held by that goroutine — Unlock
|
||||
// happens at the end of the provision, after the new container
|
||||
// is up.
|
||||
gate := acquireRestartProvisionGate(workspaceID)
|
||||
gate.Lock()
|
||||
// Stop leg first. CP-first ordering matches the other dispatchers
|
||||
// (provisionWorkspaceAuto, StopWorkspaceAuto) and the convention
|
||||
// documented in docs/architecture/backends.md.
|
||||
if h.cpProv != nil {
|
||||
h.cpStopWithRetry(ctx, workspaceID, "RestartWorkspaceAuto")
|
||||
// resetClaudeSession is Docker-only — CP has no session state to clear.
|
||||
h.goAsync(func() { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) })
|
||||
// h.goAsync (not raw `go`) so the goroutine is TRACKED on h.asyncWG
|
||||
// (shutdown/leak management + tests can waitAsyncForTest). The
|
||||
// gate unlock is the provision leg's tail — it's the load-bearing
|
||||
// exclusion that lets the second concurrent cycle start only after
|
||||
// this one's Start is fully done.
|
||||
h.goAsync(func() {
|
||||
defer gate.Unlock()
|
||||
h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
})
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
// Docker.Stop has no retry — see docstring rationale.
|
||||
h.provisioner.Stop(ctx, workspaceID)
|
||||
h.goAsync(func() { h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession) })
|
||||
// h.goAsync for the same reason as the cpProv branch above — the
|
||||
// per-workspace gate is held for the entire cycle (Stop done
|
||||
// synchronously, then async provision that releases on completion).
|
||||
h.goAsync(func() {
|
||||
defer gate.Unlock()
|
||||
h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession)
|
||||
})
|
||||
return true
|
||||
}
|
||||
// No backend wired — same shape as provisionWorkspaceAuto's no-backend
|
||||
@@ -380,5 +411,6 @@ func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspa
|
||||
h.markProvisionFailed(failCtx, workspaceID,
|
||||
"no provisioning backend available — workspace requires either a Docker daemon (self-hosted) or control-plane provisioner (SaaS)",
|
||||
nil)
|
||||
gate.Unlock()
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -78,6 +78,59 @@ var RestartDebounceWindow = 60 * time.Second
|
||||
// workspace-server yet — that's a separate RFC.
|
||||
var restartByIDDropCounter atomic.Uint64
|
||||
|
||||
// restartProvisionGates is a per-workspace mutex that serializes the
|
||||
// Stop+Start cycle for a given ws-<id>. Closes the race where manual
|
||||
// POST /workspaces/:id/restart (RestartWorkspaceAutoOpts) and programmatic
|
||||
// RestartByID (runRestartCycle) BOTH async-dispatch Stop→Start and
|
||||
// reached provisioner.Start twice for the same ws-<id>: the first call
|
||||
// created ws-{id} and started writing tokens, the second call raced in
|
||||
// and either (a) hit a Docker name conflict and markProvisionFailed
|
||||
// wedged the workspace to "failed", or (b) silently rotated/wrote
|
||||
// the bearer a second time and the second container start overlapped
|
||||
// the first. Both surfaced as the "401 invalid auth token" /
|
||||
// Docker-name-conflict symptom in #2659 Local Provision Lifecycle stub,
|
||||
// run 353677/job 478450.
|
||||
//
|
||||
// Each workspace gets its own *sync.Mutex on first use (sync.Map
|
||||
// LoadOrStore is the standard "map of locks" pattern — every workspace
|
||||
// that ever restarted keeps a tiny mutex in this map for the process
|
||||
// lifetime; if memory ever becomes a concern we can prune on workspace
|
||||
// removal, but workspace counts are small enough that it's a non-issue).
|
||||
//
|
||||
// The gate is intentionally separate from the existing `restartState`
|
||||
// (coalesceRestart) and the RestartDebounceWindow self-fire debounce:
|
||||
// - restartState coalesces: collapses N rapid concurrent RestartByID
|
||||
// calls into ≤2 sequential cycles (the in-flight one + one drain).
|
||||
// - RestartDebounceWindow self-fire: drops successive RestartByID
|
||||
// calls within 60s of the most recent cycle start (closes the
|
||||
// probe-during-EC2-pending self-fire loop).
|
||||
// - restartProvisionGate (THIS): the load-bearing exclusion that
|
||||
// makes "only ONE Docker create per ws-<id> at a time" hold across
|
||||
// the TWO different entry points (manual Restart HTTP + programmatic
|
||||
// RestartByID). The existing two gates only cover the RestartByID
|
||||
// path; the manual Restart HTTP handler bypassed both and called
|
||||
// RestartWorkspaceAutoOpts directly.
|
||||
//
|
||||
// Both RestartWorkspaceAutoOpts and runRestartCycle acquire this gate
|
||||
// around their Stop+Start pair. A concurrent caller blocks (Go mutex
|
||||
// semantics) until the in-flight cycle completes — so the second
|
||||
// caller's Stop+Start runs AFTER the first's Start is fully done, and
|
||||
// the second's provisioner.Start is the only one in flight at a time.
|
||||
// The HTTP UX cost: a user double-clicking Restart gets a delayed
|
||||
// response on the second click (the second Stop+Start waits). That's
|
||||
// strictly better than the pre-fix behavior where the second click
|
||||
// wedged the workspace to "failed".
|
||||
var restartProvisionGates sync.Map // map[workspaceID]*sync.Mutex
|
||||
|
||||
// acquireRestartProvisionGate returns the per-workspace mutex, creating
|
||||
// it on first use. Caller MUST defer Unlock after acquisition. The
|
||||
// mutex is intended to wrap the entire Stop+Start cycle, not just
|
||||
// provisioner.Start — the race is in the full sequence, not only Start.
|
||||
func acquireRestartProvisionGate(workspaceID string) *sync.Mutex {
|
||||
sv, _ := restartProvisionGates.LoadOrStore(workspaceID, &sync.Mutex{})
|
||||
return sv.(*sync.Mutex)
|
||||
}
|
||||
|
||||
// fileWriteRestartDebounceWindow is the per-workspace coalescing window for
|
||||
// the file-write → RestartByID trigger fired by templates.go's WriteFile,
|
||||
// DeleteFile, and ReplaceFiles handlers (and template_import.go's variants).
|
||||
@@ -810,9 +863,29 @@ func (h *WorkspaceHandler) cpStopWithRetryErr(ctx context.Context, workspaceID,
|
||||
// outer pending-flag loop in RestartByID can correctly coalesce — if this
|
||||
// returned before the new container was up, the loop would race the
|
||||
// in-progress provision goroutine on the next iteration's Stop call.
|
||||
//
|
||||
// The cycle is wrapped in the per-workspace restart/provision GATE
|
||||
// (acquireRestartProvisionGate) so concurrent programmatic RestartByID
|
||||
// calls and the manual HTTP Restart handler (RestartWorkspaceAutoOpts)
|
||||
// cannot overlap their provisioner.Start calls for the same ws-<id>.
|
||||
// The outer coalesceRestart pending-flag already serializes N
|
||||
// programmatic RestartByID calls into ≤2 sequential cycles; this gate
|
||||
// closes the second class of race (manual + programmatic, or two
|
||||
// distinct programmatic entry points firing near-simultaneously) that
|
||||
// the pending-flag didn't cover.
|
||||
func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
|
||||
ctx := context.Background()
|
||||
|
||||
// Per-workspace restart/provision gate. The same gate is acquired
|
||||
// in RestartWorkspaceAutoOpts (the manual HTTP Restart path), so
|
||||
// the manual and programmatic paths mutually exclude on Stop+Start.
|
||||
// Held for the entire cycle (Stop → provision); the coalesceRestart
|
||||
// drain loop's inner cycles re-use the same gate because the
|
||||
// outer cycle holds it across the drain.
|
||||
gate := acquireRestartProvisionGate(workspaceID)
|
||||
gate.Lock()
|
||||
defer gate.Unlock()
|
||||
|
||||
var wsName, status, dbRuntime string
|
||||
var tier int
|
||||
err := db.DB.QueryRowContext(ctx,
|
||||
|
||||
@@ -0,0 +1,215 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// resetRestartProvisionGateFor clears the per-workspace provision-gate
|
||||
// mutex for the given workspaceID. Tests must call this between
|
||||
// scenarios because restartProvisionGates is a package-level sync.Map
|
||||
// shared across every restart path (manual HTTP + programmatic
|
||||
// RestartByID).
|
||||
func resetRestartProvisionGateFor(workspaceID string) {
|
||||
restartProvisionGates.Delete(workspaceID)
|
||||
}
|
||||
|
||||
// TestRestartProvisionGate_SingleCallRunsOneCycle is the baseline:
|
||||
// no contention, one cycle. If this fails the gate infrastructure is
|
||||
// broken at the simplest path.
|
||||
func TestRestartProvisionGate_SingleCallRunsOneCycle(t *testing.T) {
|
||||
const wsID = "test-provgate-single"
|
||||
resetRestartProvisionGateFor(wsID)
|
||||
|
||||
gate := acquireRestartProvisionGate(wsID)
|
||||
var calls atomic.Int32
|
||||
gate.Lock()
|
||||
calls.Add(1)
|
||||
gate.Unlock()
|
||||
|
||||
if got := calls.Load(); got != 1 {
|
||||
t.Errorf("expected 1 cycle, got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRestartProvisionGate_ConcurrentAcquiresSerialize verifies the
|
||||
// load-bearing property: a second goroutine calling Lock() while the
|
||||
// first holds the gate must block until the first releases. This is
|
||||
// the "only ONE Docker create for ws-<id> at a time" invariant — the
|
||||
// second caller's Stop+Start is fully serialized behind the first's.
|
||||
func TestRestartProvisionGate_ConcurrentAcquiresSerialize(t *testing.T) {
|
||||
const wsID = "test-provgate-serialize"
|
||||
resetRestartProvisionGateFor(wsID)
|
||||
|
||||
gate := acquireRestartProvisionGate(wsID)
|
||||
|
||||
// Goroutine 1 holds the gate, then signals "ready", then waits
|
||||
// for the test harness to release it. Goroutine 2 tries to Lock
|
||||
// while goroutine 1 holds it. If the gate is doing its job,
|
||||
// goroutine 2's Lock() returns only AFTER goroutine 1 unlocks.
|
||||
holder1Ready := make(chan struct{})
|
||||
holder1CanProceed := make(chan struct{})
|
||||
holder2AcquiredAt := make(chan time.Time, 1)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
// Goroutine 1: holds the gate, signals ready, waits.
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gate.Lock()
|
||||
close(holder1Ready)
|
||||
<-holder1CanProceed
|
||||
gate.Unlock()
|
||||
}()
|
||||
|
||||
// Goroutine 2: tries to Lock while goroutine 1 holds. Records
|
||||
// when it actually acquires the lock.
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
<-holder1Ready
|
||||
gate.Lock()
|
||||
holder2AcquiredAt <- time.Now()
|
||||
gate.Unlock()
|
||||
}()
|
||||
|
||||
// Wait for goroutine 1 to have the lock, then sleep a beat so
|
||||
// goroutine 2's Lock() has definitely had a chance to block.
|
||||
<-holder1Ready
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// Release goroutine 1. Goroutine 2 should now acquire and
|
||||
// timestamp itself.
|
||||
releaseAt := time.Now()
|
||||
close(holder1CanProceed)
|
||||
|
||||
// Goroutine 2's acquisition must happen at or after releaseAt
|
||||
// (the gate is FIFO for non-starved callers, so a strict >=
|
||||
// suffices; the sleep ensures we're not just measuring jitter).
|
||||
select {
|
||||
case got := <-holder2AcquiredAt:
|
||||
if got.Before(releaseAt) {
|
||||
t.Errorf("goroutine 2 acquired the gate BEFORE release: got=%v release=%v", got, releaseAt)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("goroutine 2 never acquired the gate (deadlock)")
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// TestRestartProvisionGate_ConcurrentCyclesSerializeOnGate is the
|
||||
// end-to-end invariant test for the bug we're closing: when the
|
||||
// manual HTTP Restart path and the programmatic RestartByID path
|
||||
// (or two distinct programmatic callers) BOTH try to enter their
|
||||
// Stop+Start cycle simultaneously, the gate serializes them so
|
||||
// the second caller's provisioner.Start runs only AFTER the first's
|
||||
// is fully done. Mirrors the production call shape — both paths
|
||||
// acquire the same per-workspace gate, then run their cycle inside
|
||||
// the Lock/Unlock.
|
||||
//
|
||||
// In production this is the contract that prevents the "two
|
||||
// provisioner.Start calls for the same ws-<id> → Docker name
|
||||
// conflict → markProvisionFailed" symptom that wedged the
|
||||
// workspace to "failed" in #2659 run 353677/job 478450.
|
||||
func TestRestartProvisionGate_ConcurrentCyclesSerializeOnGate(t *testing.T) {
|
||||
const wsID = "test-provgate-concurrent-cycles"
|
||||
resetRestartProvisionGateFor(wsID)
|
||||
|
||||
gate := acquireRestartProvisionGate(wsID)
|
||||
|
||||
// Each "cycle" is a closure that: (1) acquires the gate, (2)
|
||||
// increments the shared counter, (3) records its start time
|
||||
// and increment index, (4) sleeps briefly so the OTHER cycle has
|
||||
// a chance to interleave (it WON'T, if the gate is doing its job),
|
||||
// (5) records its end time, (6) releases the gate.
|
||||
var (
|
||||
cycleStarts atomic.Int32
|
||||
cycleEnds atomic.Int32
|
||||
)
|
||||
|
||||
cycle := func() {
|
||||
gate.Lock()
|
||||
defer gate.Unlock()
|
||||
idx := cycleStarts.Add(1)
|
||||
_ = idx
|
||||
// Hold long enough that any interleaving cycle would have
|
||||
// also entered its critical section if the gate were broken.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
cycleEnds.Add(1)
|
||||
}
|
||||
|
||||
// Fire two cycles concurrently via separate goroutines. If the
|
||||
// gate is doing its job, cycleEnds == cycleStarts at the end
|
||||
// (each cycle fully exits before the next enters). If the gate
|
||||
// is broken (or missing), two cycles would overlap in the
|
||||
// critical section and cycleEnds would lag cycleStarts.
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < 2; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cycle()
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Invariant: starts and ends must match. If they don't, the
|
||||
// critical sections overlapped — which would mean the gate
|
||||
// failed to serialize them.
|
||||
if got := cycleStarts.Load(); got != 2 {
|
||||
t.Errorf("expected 2 cycle starts, got %d", got)
|
||||
}
|
||||
if got := cycleEnds.Load(); got != 2 {
|
||||
t.Errorf("expected 2 cycle ends (matching starts), got %d — cycles overlapped inside the gate's critical section", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRestartProvisionGate_TwoWorkspacesIndependent verifies the
|
||||
// gate is per-workspace: holding the gate for ws-A does NOT block
|
||||
// acquisition of the gate for ws-B. (If the implementation
|
||||
// accidentally used a single global mutex, this would deadlock.)
|
||||
func TestRestartProvisionGate_TwoWorkspacesIndependent(t *testing.T) {
|
||||
const wsA = "test-provgate-ws-a"
|
||||
const wsB = "test-provgate-ws-b"
|
||||
resetRestartProvisionGateFor(wsA)
|
||||
resetRestartProvisionGateFor(wsB)
|
||||
t.Cleanup(func() {
|
||||
resetRestartProvisionGateFor(wsA)
|
||||
resetRestartProvisionGateFor(wsB)
|
||||
})
|
||||
|
||||
gateA := acquireRestartProvisionGate(wsA)
|
||||
gateA.Lock()
|
||||
defer gateA.Unlock()
|
||||
|
||||
// If the implementation accidentally used a single global
|
||||
// mutex, this Lock() would block forever waiting for the
|
||||
// release of the defer above. Use a short timeout via a
|
||||
// select-equivalent (goroutine + channel).
|
||||
type result struct {
|
||||
ok bool
|
||||
}
|
||||
done := make(chan result, 1)
|
||||
go func() {
|
||||
gateB := acquireRestartProvisionGate(wsB)
|
||||
gateB.Lock()
|
||||
// Hold briefly to prove the critical section was actually acquired; this
|
||||
// also satisfies staticcheck SA2001 (empty critical section) because the
|
||||
// Lock/Unlock pair encloses a real operation.
|
||||
time.Sleep(1 * time.Millisecond)
|
||||
gateB.Unlock()
|
||||
done <- result{ok: true}
|
||||
}()
|
||||
|
||||
select {
|
||||
case r := <-done:
|
||||
if !r.ok {
|
||||
t.Error("ws-B gate Lock/Unlock returned !ok")
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("ws-B gate acquisition blocked while ws-A was held — gates are not per-workspace")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user