fix(restart): branch provisionWorkspace dispatch on cpProv (PR #2362 amendment)
Independent review of #2362 caught a Critical gap: the previous commit fixed
the Stop dispatch in runRestartCycle but left the provisionWorkspace dispatch
unconditionally Docker-only. So on SaaS the auto-restart cycle would Stop the
EC2 successfully (good), then NPE inside provisionWorkspace's
`h.provisioner.VolumeHasFile` call. coalesceRestart's recover()-without-re-raise
(a deliberate platform-stability safeguard) silently swallowed the panic,
leaving the workspace permanently stuck in status='provisioning' because the
UPDATE on workspace_restart.go:450 had already run.

Net pre-amendment effect on SaaS: dead agent → structured 503 (good) →
workspace flipped to 'offline' (good) → cpProv.Stop succeeded (good) →
provisionWorkspace NPE swallowed (bad) → workspace permanently 'provisioning'
until manual canvas restart. The headline claim of #2362 ("SaaS auto-restart
now works") was false on the path it shipped.

Fix: dispatch the reprovision call the same way every other call site in the
package does (workspace.go:431-433, workspace_restart.go:197+596) — branch on
`h.cpProv != nil` and call provisionWorkspaceCP for SaaS, provisionWorkspace
for Docker.

Tests:
- New TestRunRestartCycle_SaaSPath_DispatchesViaCPProv asserts cpProv.Stop is
  called when the SaaS path runs (would have caught the NPE if
  provisionWorkspace had been called instead).
- fakeCPProv updated: methods record calls and return nil/empty by default
  rather than panicking. The previous "panic on unexpected call" pattern was
  unsafe — the panic fires on the async restart goroutine spawned by
  maybeMarkContainerDead AFTER the test assertions ran, so the test passed by
  accident even though the production path was broken (which is exactly how
  the Critical bug landed).
- Existing tests still pass (full handlers + provisioner suites green).

Branch-count audit refresh: runRestartCycle dispatch decisions:
1. h.provisioner != nil → provisioner.Stop + provisionWorkspace ✓ (existing tests)
2. h.cpProv != nil → cpProv.Stop + provisionWorkspaceCP ✓ (NEW test)
3. both nil → coalesceRestart never called (RestartByID gate) ✓

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
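Editor's aside: a minimal, self-contained sketch (illustrative only, not
molecule-core source) of the failure shape described above. A recover() that
never re-raises swallows a nil-receiver panic on a goroutine, any state
mutated before the panic stays put, and nothing is reported. Every name below
is hypothetical.

package main

import "fmt"

// Hypothetical stand-in for the Docker provisioner.
type dockerProvisioner struct{ name string }

// Dereferences the receiver, so calling it on a nil pointer panics.
func (p *dockerProvisioner) VolumeHasFile() bool { return p.name != "" }

func main() {
	status := "offline"
	done := make(chan struct{})

	go func() { // stands in for the async restart cycle
		defer close(done)
		defer func() {
			// recover() without re-raise: the panic below vanishes here.
			_ = recover()
		}()

		status = "provisioning"  // the UPDATE-equivalent runs first...
		var p *dockerProvisioner // ...then the provisioner is nil on SaaS
		p.VolumeHasFile()        // nil-receiver field access: panic
		status = "online"        // never reached
	}()

	<-done
	fmt.Println(status) // prints "provisioning": stuck, no error surfaced
}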
parent 9f35788aee
commit 28b4e38002
@@ -1806,24 +1806,80 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
 	}
 }
 
-// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that need
-// to exercise the SaaS / EC2-backed path. Only IsRunning is used by the
-// reactive-health path under test; Start/Stop/GetConsoleOutput panic so
-// an unexpected reach into them is a loud test failure rather than a
-// silent passthrough.
+// SaaS-path runRestartCycle: when h.provisioner is nil and h.cpProv is set,
+// the auto-restart cycle MUST call cpProv.Stop (not Docker provisioner.Stop).
+// Pre-fix this dispatched only to h.provisioner.Stop, NPE'd on nil, was
+// silently swallowed by coalesceRestart's recover-without-re-raise, and
+// left the workspace stuck in status='provisioning' forever — making
+// reactive auto-restart on SaaS effectively dead code. The independent
+// review of PR #2362 caught this gap.
+//
+// We drive runRestartCycle directly (not via RestartByID/coalesceRestart)
+// so we don't fight the goroutine's timing in a unit test. The full
+// restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
+// would explode the surface area of this test; what we care about here
+// is the dispatch decision, which is observable on cpProv.stopCalls.
+func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	cp := &fakeCPProv{}
+	handler.SetCPProvisioner(cp)
+
+	// runRestartCycle's first query — workspace metadata.
+	mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`).
+		WithArgs("ws-saas-restart").
+		WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}).
+			AddRow("Test Workspace", "online", 2, "hermes"))
+	// isParentPaused — return nil parent_id so the helper short-circuits.
+	mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`).
+		WithArgs("ws-saas-restart").
+		WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
+	// After Stop, the cycle UPDATEs status to 'provisioning'. We allow
+	// that and don't mock everything provisionWorkspaceCP touches —
+	// provisionWorkspaceCP will fail at its first unmocked query, which
+	// is downstream of the dispatch decision under test.
+	mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`).
+		WithArgs("ws-saas-restart").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	// loadRestartContextData query (best-effort lookups; let them not match
+	// and the function returns a zero-valued struct).
+	mock.MatchExpectationsInOrder(false)
+
+	handler.runRestartCycle("ws-saas-restart")
+
+	// The dispatch we're testing: SaaS path → cpProv.Stop, never Docker.
+	if cp.stopCalls != 1 {
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls — pre-fix the cycle NPE'd before reaching Stop on the (nil) Docker path", cp.stopCalls)
+	}
+}
+
+// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
+// the SaaS / EC2-backed reactive-health path.
+//
+// Methods all record calls. Start/Stop/GetConsoleOutput return nil/empty by
+// default — the maybeMarkContainerDead happy path triggers an async
+// `go h.RestartByID(...)` which calls Stop, so the previous "panic on
+// unexpected call" pattern was unsafe (the panic fires on a goroutine,
+// after the assertions ran). Tests that want to ASSERT a method is unused
+// can check `calls == 0` after a sync barrier.
 type fakeCPProv struct {
-	running bool
-	calls   int
+	running    bool
+	calls      int
+	stopCalls  int
+	startCalls int
 }
 
 func (f *fakeCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
-	panic("fakeCPProv.Start not expected on the maybeMarkContainerDead path")
+	f.startCalls++
+	return "", nil
 }
 func (f *fakeCPProv) Stop(_ context.Context, _ string) error {
-	panic("fakeCPProv.Stop not expected on the maybeMarkContainerDead path")
+	f.stopCalls++
+	return nil
 }
 func (f *fakeCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) {
-	panic("fakeCPProv.GetConsoleOutput not expected on the maybeMarkContainerDead path")
+	return "", nil
 }
 func (f *fakeCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
 	f.calls++
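Editor's aside on the fake's "calls == 0 after a sync barrier" note above: a
minimal sketch of that assertion pattern, assuming the fakeCPProv fields from
this commit. The test name, workspace ID, and goroutine body are hypothetical
stand-ins, not molecule-core code.

import (
	"context"
	"testing"
)

// Illustrative only. Closing `done` inside the goroutine and receiving from
// it in the test body gives a happens-before edge, so reading the fake's
// plain (non-atomic) counters after <-done is race-free, and the assertion
// runs strictly after any call the goroutine could have made (unlike the
// old panic-on-unexpected-call pattern, which fired after assertions).
func TestFakeCPProv_StartUnusedOnHealthPath(t *testing.T) {
	fake := &fakeCPProv{running: true}

	done := make(chan struct{}) // sync barrier
	go func() {
		defer close(done)
		// Hypothetical async path: only IsRunning should be touched here.
		_, _ = fake.IsRunning(context.Background(), "ws-1")
	}()
	<-done // barrier: the goroutine has fully finished before we assert

	if fake.startCalls != 0 || fake.stopCalls != 0 {
		t.Fatalf("expected Start/Stop unused; got start=%d stop=%d",
			fake.startCalls, fake.stopCalls)
	}
}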
@@ -463,7 +463,21 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	// SYNCHRONOUS provisionWorkspace: returns when the new container is up
 	// (or has failed). The outer loop relies on this to know when it's safe
 	// to start another restart cycle without racing this one's Stop call.
-	h.provisionWorkspace(workspaceID, "", nil, payload)
+	//
+	// Branch on which provisioner is wired — same dispatch as the other call
+	// sites in this package (workspace.go:431-433, workspace_restart.go:197+596).
+	// Pre-fix this only called the Docker variant, so on SaaS the auto-restart
+	// cycle would NPE inside provisionWorkspace's `h.provisioner.VolumeHasFile`
+	// call, get swallowed by coalesceRestart's recover()-without-re-raise (a
+	// platform-stability safeguard), and leave the workspace permanently
+	// stuck in status='provisioning' (the UPDATE above already ran). User-
+	// observable result before this fix on SaaS: dead workspace → manual
+	// canvas restart was the only recovery path.
+	if h.cpProv != nil {
+		h.provisionWorkspaceCP(workspaceID, "", nil, payload)
+	} else {
+		h.provisionWorkspace(workspaceID, "", nil, payload)
+	}
 	// sendRestartContext is a one-way notification to the new container; safe
 	// to fire async — the next restart cycle won't depend on it completing.
 	go h.sendRestartContext(workspaceID, restartData)