diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go
index f24b8fc3..d986a2c0 100644
--- a/workspace-server/internal/handlers/workspace_restart.go
+++ b/workspace-server/internal/handlers/workspace_restart.go
@@ -5,6 +5,7 @@ import (
 	"database/sql"
 	"log"
 	"net/http"
+	"runtime/debug"
 	"strings"
 	"sync"
 	"time"
@@ -352,6 +353,27 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	state.running = true
 	state.mu.Unlock()
 
+	// Always clear running on exit — including panic — so a panicking
+	// cycle (e.g. a future provisionWorkspace nil-deref) doesn't leave
+	// the workspace permanently locked out of restarts.
+	//
+	// recover()-and-DON'T-re-raise on purpose: this runs in a goroutine
+	// (callers are `go h.RestartByID(...)`); an unrecovered panic in a
+	// goroutine crashes the whole platform process, taking down every
+	// workspace served by this binary because of one bug in cycle for
+	// one workspace. Log the panic with stack trace for debuggability,
+	// then recover and let the goroutine exit cleanly. The next restart
+	// request for this workspace will see running=false and proceed.
+	defer func() {
+		state.mu.Lock()
+		state.running = false
+		state.mu.Unlock()
+		if r := recover(); r != nil {
+			log.Printf("Auto-restart: %s — cycle panicked, restart-state cleared: %v\n%s",
+				workspaceID, r, debug.Stack())
+		}
+	}()
+
 	// Drain pending requests. Each iteration re-loads workspace_secrets
 	// inside provisionWorkspace, so any writes that committed since the
 	// last cycle are picked up. Continues until no pending request was
@@ -359,9 +381,8 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	for {
 		state.mu.Lock()
 		if !state.pending {
-			state.running = false
 			state.mu.Unlock()
-			return
+			return // defer clears running
 		}
 		state.pending = false
 		state.mu.Unlock()
diff --git a/workspace-server/internal/handlers/workspace_restart_coalesce_test.go b/workspace-server/internal/handlers/workspace_restart_coalesce_test.go
index 73ebfef1..39e14219 100644
--- a/workspace-server/internal/handlers/workspace_restart_coalesce_test.go
+++ b/workspace-server/internal/handlers/workspace_restart_coalesce_test.go
@@ -183,6 +183,43 @@ func TestCoalesceRestart_StateClearedAfterDrain(t *testing.T) {
 	}
 }
 
+// TestCoalesceRestart_PanicInCycleClearsState defends against a
+// regression of the sticky-running deadlock: if cycle() panics, the
+// running flag MUST be cleared so a follow-up RestartByID for the
+// same workspace can still acquire the gate. Without the deferred
+// state-clear + recover, a single panic would permanently lock a
+// workspace out of all future restarts until process restart.
+//
+// Also asserts the panic is RECOVERED (not re-raised): callers are
+// `go h.RestartByID(...)` from HTTP handlers, and an unrecovered
+// goroutine panic in Go takes down the whole process. Crashing the
+// platform for every tenant because one workspace's cycle panicked
+// is the wrong availability tradeoff. The panic message + stack
+// trace are still logged for debuggability.
+func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
+	const wsID = "test-coalesce-panic-recovery"
+	resetRestartStatesFor(wsID)
+
+	// First call's cycle panics. coalesceRestart's defer must swallow
+	// the panic so this test caller doesn't see it propagate up — that
+	// matches what the real production caller (`go h.RestartByID(...)`)
+	// gets: the goroutine survives, no process crash.
+	defer func() {
+		if r := recover(); r != nil {
+			t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r)
+		}
+	}()
+	coalesceRestart(wsID, func() { panic("simulated cycle failure") })
+
+	// Second call must run a fresh cycle. If running stayed true after
+	// the panic, this call would early-return without invoking cycle.
+	var ran atomic.Bool
+	coalesceRestart(wsID, func() { ran.Store(true) })
+	if !ran.Load() {
+		t.Error("post-panic restart was blocked — running flag leaked, workspace permanently locked out")
+	}
+}
+
 // TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the
 // per-workspace state map: an in-flight restart for ws A must not
 // block restarts for ws B. Important for performance — without this,