Merge pull request #2267 from Molecule-AI/fix/restart-panic-recovery

fix(restart): clear running flag on panic in cycle()
hongmingwang-moleculeai 2026-04-29 07:10:44 +00:00 committed by GitHub
commit aa6c42f042
2 changed files with 60 additions and 2 deletions
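
Why recover() instead of letting the panic propagate: in Go, a panic that unwinds off any goroutine, not just main, terminates the entire process. A minimal standalone demonstration of that hazard and of the recover pattern this commit adds (all names below are illustrative, nothing here is from the repository):

package main

import "fmt"

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		// Without this recover, the panic below would kill the WHOLE
		// process, every goroutine included, with exit status 2.
		defer func() {
			if r := recover(); r != nil {
				fmt.Println("recovered:", r)
			}
		}()
		panic("simulated cycle failure")
	}()
	<-done
	fmt.Println("process still alive; other goroutines keep being served")
}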


@@ -5,6 +5,7 @@ import (
 	"database/sql"
 	"log"
 	"net/http"
+	"runtime/debug"
 	"strings"
 	"sync"
 	"time"
@@ -352,6 +353,27 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	state.running = true
 	state.mu.Unlock()
+
+	// Always clear running on exit — including panic — so a panicking
+	// cycle (e.g. a future provisionWorkspace nil-deref) doesn't leave
+	// the workspace permanently locked out of restarts.
+	//
+	// recover()-and-DON'T-re-raise on purpose: this runs in a goroutine
+	// (callers are `go h.RestartByID(...)`); an unrecovered panic in a
+	// goroutine crashes the whole platform process, taking down every
+	// workspace served by this binary because of one bug in cycle for
+	// one workspace. Log the panic with stack trace for debuggability,
+	// then recover and let the goroutine exit cleanly. The next restart
+	// request for this workspace will see running=false and proceed.
+	defer func() {
+		state.mu.Lock()
+		state.running = false
+		state.mu.Unlock()
+		if r := recover(); r != nil {
+			log.Printf("Auto-restart: %s — cycle panicked, restart-state cleared: %v\n%s",
+				workspaceID, r, debug.Stack())
+		}
+	}()
 	// Drain pending requests. Each iteration re-loads workspace_secrets
 	// inside provisionWorkspace, so any writes that committed since the
 	// last cycle are picked up. Continues until no pending request was
@@ -359,9 +381,8 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	for {
 		state.mu.Lock()
 		if !state.pending {
-			state.running = false
 			state.mu.Unlock()
-			return
+			return // defer clears running
 		}
 		state.pending = false
 		state.mu.Unlock()
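
The hunks above show only the changed region of coalesceRestart. As a reading aid, here is a minimal self-contained sketch of how the whole gate plausibly reads after this commit. Only the defer block and the drain loop are taken verbatim from the diff; the restartState struct, the restartStates map, stateFor, the exact gate ordering, and main are assumptions reconstructed from the visible context lines, not the repository's actual declarations.

package main

import (
	"log"
	"runtime/debug"
	"sync"
)

// Assumed per-workspace gate. The field names running and pending match
// the context lines in the diff; everything else is illustrative.
type restartState struct {
	mu      sync.Mutex
	running bool
	pending bool
}

var (
	statesMu      sync.Mutex
	restartStates = make(map[string]*restartState)
)

// stateFor is a hypothetical helper returning the gate for a workspace.
func stateFor(workspaceID string) *restartState {
	statesMu.Lock()
	defer statesMu.Unlock()
	s, ok := restartStates[workspaceID]
	if !ok {
		s = &restartState{}
		restartStates[workspaceID] = s
	}
	return s
}

func coalesceRestart(workspaceID string, cycle func()) {
	state := stateFor(workspaceID)

	// Gate: record the request; if a cycle is already in flight, the
	// drain loop below will pick it up, so this call can return.
	state.mu.Lock()
	state.pending = true
	if state.running {
		state.mu.Unlock()
		return
	}
	state.running = true
	state.mu.Unlock()

	// The fix: clear running on every exit path, including panic, and
	// recover so the caller's goroutine does not crash the process.
	defer func() {
		state.mu.Lock()
		state.running = false
		state.mu.Unlock()
		if r := recover(); r != nil {
			log.Printf("Auto-restart: %s — cycle panicked, restart-state cleared: %v\n%s",
				workspaceID, r, debug.Stack())
		}
	}()

	// Drain: run cycle once per observed pending request.
	for {
		state.mu.Lock()
		if !state.pending {
			state.mu.Unlock()
			return // defer clears running
		}
		state.pending = false
		state.mu.Unlock()
		cycle()
	}
}

func main() {
	coalesceRestart("ws-demo", func() { panic("simulated cycle failure") })
	coalesceRestart("ws-demo", func() { log.Println("post-panic cycle ran") }) // not blocked
}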


@@ -183,6 +183,43 @@ func TestCoalesceRestart_StateClearedAfterDrain(t *testing.T) {
 	}
 }
+
+// TestCoalesceRestart_PanicInCycleClearsState defends against a
+// regression of the sticky-running deadlock: if cycle() panics, the
+// running flag MUST be cleared so a follow-up RestartByID for the
+// same workspace can still acquire the gate. Without the deferred
+// state-clear + recover, a single panic would permanently lock a
+// workspace out of all future restarts until process restart.
+//
+// Also asserts the panic is RECOVERED (not re-raised): callers are
+// `go h.RestartByID(...)` from HTTP handlers, and an unrecovered
+// goroutine panic in Go takes down the whole process. Crashing the
+// platform for every tenant because one workspace's cycle panicked
+// is the wrong availability tradeoff. The panic message + stack
+// trace are still logged for debuggability.
+func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
+	const wsID = "test-coalesce-panic-recovery"
+	resetRestartStatesFor(wsID)
+
+	// First call's cycle panics. coalesceRestart's defer must swallow
+	// the panic so this test caller doesn't see it propagate up — that
+	// matches what the real production caller (`go h.RestartByID(...)`)
+	// gets: the goroutine survives, no process crash.
+	defer func() {
+		if r := recover(); r != nil {
+			t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r)
+		}
+	}()
+	coalesceRestart(wsID, func() { panic("simulated cycle failure") })
+
+	// Second call must run a fresh cycle. If running stayed true after
+	// the panic, this call would early-return without invoking cycle.
+	var ran atomic.Bool
+	coalesceRestart(wsID, func() { ran.Store(true) })
+	if !ran.Load() {
+		t.Error("post-panic restart was blocked — running flag leaked, workspace permanently locked out")
+	}
+}
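
The comments in this test pin down the production caller shape: RestartByID launched on its own goroutine from an HTTP handler. A hypothetical sketch of that call site, reusing coalesceRestart from the sketch above (Handler, ServeRestart, and the route parameter are illustrative; add "net/http" to the imports; r.PathValue needs Go 1.22+):

// Hypothetical caller. By the time cycle runs, the handler has already
// answered 202, so nothing up the stack can catch a panic from cycle;
// recovery has to happen inside coalesceRestart itself.
type Handler struct{}

func (h *Handler) RestartByID(workspaceID string) {
	coalesceRestart(workspaceID, func() {
		// provisionWorkspace + container bounce would go here.
	})
}

func (h *Handler) ServeRestart(w http.ResponseWriter, r *http.Request) {
	go h.RestartByID(r.PathValue("id")) // fire-and-forget goroutine
	w.WriteHeader(http.StatusAccepted)
}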
 
 // TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the
 // per-workspace state map: an in-flight restart for ws A must not
 // block restarts for ws B. Important for performance — without this,