Merge pull request #2267 from Molecule-AI/fix/restart-panic-recovery

fix(restart): clear running flag on panic in cycle()
hongmingwang-moleculeai 2026-04-29 07:10:44 +00:00 committed by GitHub
commit aa6c42f042
2 changed files with 60 additions and 2 deletions
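
Why recover() instead of letting the panic propagate: in Go, a panic that unwinds off any goroutine, not just main, terminates the entire process. A minimal standalone demonstration of that hazard and of the recover pattern this commit adds (all names below are illustrative, nothing here is from the repository):

package main

import "fmt"

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		// Without this recover, the panic below would kill the WHOLE
		// process, every goroutine included, with exit status 2.
		defer func() {
			if r := recover(); r != nil {
				fmt.Println("recovered:", r)
			}
		}()
		panic("simulated cycle failure")
	}()
	<-done
	fmt.Println("process still alive; other goroutines keep being served")
}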


@@ -5,6 +5,7 @@ import (
 	"database/sql"
 	"log"
 	"net/http"
+	"runtime/debug"
 	"strings"
 	"sync"
 	"time"
@@ -352,6 +353,27 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	state.running = true
 	state.mu.Unlock()
+
+	// Always clear running on exit — including panic — so a panicking
+	// cycle (e.g. a future provisionWorkspace nil-deref) doesn't leave
+	// the workspace permanently locked out of restarts.
+	//
+	// recover()-and-DON'T-re-raise on purpose: this runs in a goroutine
+	// (callers are `go h.RestartByID(...)`); an unrecovered panic in a
+	// goroutine crashes the whole platform process, taking down every
+	// workspace served by this binary because of one bug in cycle for
+	// one workspace. Log the panic with stack trace for debuggability,
+	// then recover and let the goroutine exit cleanly. The next restart
+	// request for this workspace will see running=false and proceed.
+	defer func() {
+		state.mu.Lock()
+		state.running = false
+		state.mu.Unlock()
+		if r := recover(); r != nil {
+			log.Printf("Auto-restart: %s — cycle panicked, restart-state cleared: %v\n%s",
+				workspaceID, r, debug.Stack())
+		}
+	}()
 	// Drain pending requests. Each iteration re-loads workspace_secrets
 	// inside provisionWorkspace, so any writes that committed since the
 	// last cycle are picked up. Continues until no pending request was
@@ -359,9 +381,8 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	for {
 		state.mu.Lock()
 		if !state.pending {
-			state.running = false
 			state.mu.Unlock()
-			return
+			return // defer clears running
 		}
 		state.pending = false
 		state.mu.Unlock()
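
The hunks above show only the changed region of coalesceRestart. As a reading aid, here is a minimal self-contained sketch of how the whole gate plausibly reads after this commit. Only the defer block and the drain loop are taken verbatim from the diff; the restartState struct, the restartStates map, stateFor, the exact gate ordering, and main are assumptions reconstructed from the visible context lines, not the repository's actual declarations.

package main

import (
	"log"
	"runtime/debug"
	"sync"
)

// Assumed per-workspace gate. The field names running and pending match
// the context lines in the diff; everything else is illustrative.
type restartState struct {
	mu      sync.Mutex
	running bool
	pending bool
}

var (
	statesMu      sync.Mutex
	restartStates = make(map[string]*restartState)
)

// stateFor is a hypothetical helper returning the gate for a workspace.
func stateFor(workspaceID string) *restartState {
	statesMu.Lock()
	defer statesMu.Unlock()
	s, ok := restartStates[workspaceID]
	if !ok {
		s = &restartState{}
		restartStates[workspaceID] = s
	}
	return s
}

func coalesceRestart(workspaceID string, cycle func()) {
	state := stateFor(workspaceID)

	// Gate: record the request; if a cycle is already in flight, the
	// drain loop below will pick it up, so this call can return.
	state.mu.Lock()
	state.pending = true
	if state.running {
		state.mu.Unlock()
		return
	}
	state.running = true
	state.mu.Unlock()

	// The fix: clear running on every exit path, including panic, and
	// recover so the caller's goroutine does not crash the process.
	defer func() {
		state.mu.Lock()
		state.running = false
		state.mu.Unlock()
		if r := recover(); r != nil {
			log.Printf("Auto-restart: %s — cycle panicked, restart-state cleared: %v\n%s",
				workspaceID, r, debug.Stack())
		}
	}()

	// Drain: run cycle once per observed pending request.
	for {
		state.mu.Lock()
		if !state.pending {
			state.mu.Unlock()
			return // defer clears running
		}
		state.pending = false
		state.mu.Unlock()
		cycle()
	}
}

func main() {
	coalesceRestart("ws-demo", func() { panic("simulated cycle failure") })
	coalesceRestart("ws-demo", func() { log.Println("post-panic cycle ran") }) // not blocked
}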


@@ -183,6 +183,43 @@ func TestCoalesceRestart_StateClearedAfterDrain(t *testing.T) {
 	}
 }
+
+// TestCoalesceRestart_PanicInCycleClearsState defends against a
+// regression of the sticky-running deadlock: if cycle() panics, the
+// running flag MUST be cleared so a follow-up RestartByID for the
+// same workspace can still acquire the gate. Without the deferred
+// state-clear + recover, a single panic would permanently lock a
+// workspace out of all future restarts until process restart.
+//
+// Also asserts the panic is RECOVERED (not re-raised): callers are
+// `go h.RestartByID(...)` from HTTP handlers, and an unrecovered
+// goroutine panic in Go takes down the whole process. Crashing the
+// platform for every tenant because one workspace's cycle panicked
+// is the wrong availability tradeoff. The panic message + stack
+// trace are still logged for debuggability.
+func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
+	const wsID = "test-coalesce-panic-recovery"
+	resetRestartStatesFor(wsID)
+
+	// First call's cycle panics. coalesceRestart's defer must swallow
+	// the panic so this test caller doesn't see it propagate up — that
+	// matches what the real production caller (`go h.RestartByID(...)`)
+	// gets: the goroutine survives, no process crash.
+	defer func() {
+		if r := recover(); r != nil {
+			t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r)
+		}
+	}()
+	coalesceRestart(wsID, func() { panic("simulated cycle failure") })
+
+	// Second call must run a fresh cycle. If running stayed true after
+	// the panic, this call would early-return without invoking cycle.
+	var ran atomic.Bool
+	coalesceRestart(wsID, func() { ran.Store(true) })
+	if !ran.Load() {
+		t.Error("post-panic restart was blocked — running flag leaked, workspace permanently locked out")
+	}
+}
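
The comments in this test pin down the production caller shape: RestartByID launched on its own goroutine from an HTTP handler. A hypothetical sketch of that call site, reusing coalesceRestart from the sketch above (Handler, ServeRestart, and the route parameter are illustrative; add "net/http" to the imports; r.PathValue needs Go 1.22+):

// Hypothetical caller. By the time cycle runs, the handler has already
// answered 202, so nothing up the stack can catch a panic from cycle;
// recovery has to happen inside coalesceRestart itself.
type Handler struct{}

func (h *Handler) RestartByID(workspaceID string) {
	coalesceRestart(workspaceID, func() {
		// provisionWorkspace + container bounce would go here.
	})
}

func (h *Handler) ServeRestart(w http.ResponseWriter, r *http.Request) {
	go h.RestartByID(r.PathValue("id")) // fire-and-forget goroutine
	w.WriteHeader(http.StatusAccepted)
}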
 
 // TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the
 // per-workspace state map: an in-flight restart for ws A must not
 // block restarts for ws B. Important for performance — without this,