Merge pull request #2267 from Molecule-AI/fix/restart-panic-recovery
fix(restart): clear running flag on panic in cycle()
This commit is contained in:
commit
aa6c42f042
@ -5,6 +5,7 @@ import (
|
||||
"database/sql"
|
||||
"log"
|
||||
"net/http"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@ -352,6 +353,27 @@ func coalesceRestart(workspaceID string, cycle func()) {
|
||||
state.running = true
|
||||
state.mu.Unlock()
|
||||
|
||||
// Always clear running on exit — including panic — so a panicking
|
||||
// cycle (e.g. a future provisionWorkspace nil-deref) doesn't leave
|
||||
// the workspace permanently locked out of restarts.
|
||||
//
|
||||
// recover()-and-DON'T-re-raise on purpose: this runs in a goroutine
|
||||
// (callers are `go h.RestartByID(...)`); an unrecovered panic in a
|
||||
// goroutine crashes the whole platform process, taking down every
|
||||
// workspace served by this binary because of one bug in cycle for
|
||||
// one workspace. Log the panic with stack trace for debuggability,
|
||||
// then recover and let the goroutine exit cleanly. The next restart
|
||||
// request for this workspace will see running=false and proceed.
|
||||
defer func() {
|
||||
state.mu.Lock()
|
||||
state.running = false
|
||||
state.mu.Unlock()
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("Auto-restart: %s — cycle panicked, restart-state cleared: %v\n%s",
|
||||
workspaceID, r, debug.Stack())
|
||||
}
|
||||
}()
|
||||
|
||||
// Drain pending requests. Each iteration re-loads workspace_secrets
|
||||
// inside provisionWorkspace, so any writes that committed since the
|
||||
// last cycle are picked up. Continues until no pending request was
|
||||
@ -359,9 +381,8 @@ func coalesceRestart(workspaceID string, cycle func()) {
|
||||
for {
|
||||
state.mu.Lock()
|
||||
if !state.pending {
|
||||
state.running = false
|
||||
state.mu.Unlock()
|
||||
return
|
||||
return // defer clears running
|
||||
}
|
||||
state.pending = false
|
||||
state.mu.Unlock()
|
||||
|
||||
@ -183,6 +183,43 @@ func TestCoalesceRestart_StateClearedAfterDrain(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestCoalesceRestart_PanicInCycleClearsState defends against a
|
||||
// regression of the sticky-running deadlock: if cycle() panics, the
|
||||
// running flag MUST be cleared so a follow-up RestartByID for the
|
||||
// same workspace can still acquire the gate. Without the deferred
|
||||
// state-clear + recover, a single panic would permanently lock a
|
||||
// workspace out of all future restarts until process restart.
|
||||
//
|
||||
// Also asserts the panic is RECOVERED (not re-raised): callers are
|
||||
// `go h.RestartByID(...)` from HTTP handlers, and an unrecovered
|
||||
// goroutine panic in Go takes down the whole process. Crashing the
|
||||
// platform for every tenant because one workspace's cycle panicked
|
||||
// is the wrong availability tradeoff. The panic message + stack
|
||||
// trace are still logged for debuggability.
|
||||
func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
|
||||
const wsID = "test-coalesce-panic-recovery"
|
||||
resetRestartStatesFor(wsID)
|
||||
|
||||
// First call's cycle panics. coalesceRestart's defer must swallow
|
||||
// the panic so this test caller doesn't see it propagate up — that
|
||||
// matches what the real production caller (`go h.RestartByID(...)`)
|
||||
// gets: the goroutine survives, no process crash.
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r)
|
||||
}
|
||||
}()
|
||||
coalesceRestart(wsID, func() { panic("simulated cycle failure") })
|
||||
|
||||
// Second call must run a fresh cycle. If running stayed true after
|
||||
// the panic, this call would early-return without invoking cycle.
|
||||
var ran atomic.Bool
|
||||
coalesceRestart(wsID, func() { ran.Store(true) })
|
||||
if !ran.Load() {
|
||||
t.Error("post-panic restart was blocked — running flag leaked, workspace permanently locked out")
|
||||
}
|
||||
}
|
||||
|
||||
// TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the
|
||||
// per-workspace state map: an in-flight restart for ws A must not
|
||||
// block restarts for ws B. Important for performance — without this,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user