diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go
index 53d7daf9..ef9bec05 100644
--- a/workspace-server/internal/handlers/a2a_proxy_test.go
+++ b/workspace-server/internal/handlers/a2a_proxy_test.go
@@ -1806,24 +1806,80 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
 	}
 }
 
-// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that need
-// to exercise the SaaS / EC2-backed path. Only IsRunning is used by the
-// reactive-health path under test; Start/Stop/GetConsoleOutput panic so
-// an unexpected reach into them is a loud test failure rather than a
-// silent passthrough.
+// SaaS-path runRestartCycle: when h.provisioner is nil and h.cpProv is set,
+// the auto-restart cycle MUST call cpProv.Stop (not the Docker provisioner's
+// Stop). Before the fix, the cycle dispatched only to h.provisioner.Stop,
+// panicked on the nil pointer, had the panic silently swallowed by
+// coalesceRestart's recover-without-re-raise, and left the workspace stuck
+// in status='provisioning' forever, making reactive auto-restart on SaaS
+// effectively dead code. The independent review of PR #2362 caught this gap.
+//
+// We drive runRestartCycle directly (not via RestartByID/coalesceRestart)
+// so we don't fight the goroutine's timing in a unit test. The full
+// restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
+// would balloon the surface area of this test; what we care about here
+// is the dispatch decision, which is observable via cpProv.stopCalls.
+func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	cp := &fakeCPProv{}
+	handler.SetCPProvisioner(cp)
+
+	// runRestartCycle's first query: workspace metadata.
+	mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`).
+		WithArgs("ws-saas-restart").
+		WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}).
+			AddRow("Test Workspace", "online", 2, "hermes"))
+	// isParentPaused: return a NULL parent_id so the helper short-circuits.
+	mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`).
+		WithArgs("ws-saas-restart").
+		WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
+	// After Stop, the cycle UPDATEs status to 'provisioning'. We allow that
+	// but don't mock everything provisionWorkspaceCP touches; it will fail
+	// at its first unmocked query, which is downstream of the dispatch
+	// decision under test.
+	mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`).
+		WithArgs("ws-saas-restart").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	// loadRestartContextData's lookups are best-effort: let them go
+	// unmatched and the function returns a zero-valued struct.
+	mock.MatchExpectationsInOrder(false)
+
+	handler.runRestartCycle("ws-saas-restart")
+
+	// The dispatch we're testing: SaaS path → cpProv.Stop, never Docker.
+	if cp.stopCalls != 1 {
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls (pre-fix, the cycle panicked on the nil Docker provisioner before any Stop)", cp.stopCalls)
+	}
+}
+
+// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
+// the SaaS / EC2-backed reactive-health path.
+//
+// All methods record their calls. Start/Stop/GetConsoleOutput return
+// nil/empty by default: the maybeMarkContainerDead happy path triggers an
+// async `go h.RestartByID(...)` which calls Stop, so the previous "panic on
+// unexpected call" pattern was unsafe (the panic fired on a goroutine after
+// the assertions had already run). Tests that want to ASSERT a method is
+// unused can check `calls == 0` after a sync barrier.
 type fakeCPProv struct {
-	running bool
-	calls   int
+	running    bool
+	calls      int
+	stopCalls  int
+	startCalls int
 }
 
 func (f *fakeCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
-	panic("fakeCPProv.Start not expected on the maybeMarkContainerDead path")
+	f.startCalls++
+	return "", nil
 }
 
 func (f *fakeCPProv) Stop(_ context.Context, _ string) error {
-	panic("fakeCPProv.Stop not expected on the maybeMarkContainerDead path")
+	f.stopCalls++
+	return nil
 }
 
 func (f *fakeCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) {
-	panic("fakeCPProv.GetConsoleOutput not expected on the maybeMarkContainerDead path")
+	return "", nil
 }
 
 func (f *fakeCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
 	f.calls++
diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go
index 29d8a94e..7f3f7065 100644
--- a/workspace-server/internal/handlers/workspace_restart.go
+++ b/workspace-server/internal/handlers/workspace_restart.go
@@ -463,7 +463,21 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	// SYNCHRONOUS provisionWorkspace: returns when the new container is up
 	// (or has failed). The outer loop relies on this to know when it's safe
 	// to start another restart cycle without racing this one's Stop call.
-	h.provisionWorkspace(workspaceID, "", nil, payload)
+	//
+	// Branch on which provisioner is wired, matching the dispatch at the other
+	// call sites in this package (workspace.go:431-433, workspace_restart.go:197
+	// and 596). Before this fix, only the Docker variant was called: on SaaS the
+	// auto-restart cycle panicked on the nil pointer inside provisionWorkspace's
+	// `h.provisioner.VolumeHasFile` call, got swallowed by coalesceRestart's
+	// recover()-without-re-raise (a platform-stability safeguard), and left the
+	// workspace permanently stuck in status='provisioning' (the UPDATE above has
+	// already run). The user-observable result on SaaS: a dead workspace whose
+	// only recovery path was a manual canvas restart.
+	if h.cpProv != nil {
+		h.provisionWorkspaceCP(workspaceID, "", nil, payload)
+	} else {
+		h.provisionWorkspace(workspaceID, "", nil, payload)
+	}
 	// sendRestartContext is a one-way notification to the new container; safe
 	// to fire async — the next restart cycle won't depend on it completing.
 	go h.sendRestartContext(workspaceID, restartData)
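
For reviewers who want the "silently swallowed" mechanism spelled out: below is a minimal, self-contained sketch of the recover-without-re-raise shape that the comments attribute to coalesceRestart. The function name runGuarded and its signature are hypothetical; the real coalesceRestart also coalesces concurrent restart requests, which this sketch omits.

package main

import (
	"log"
	"time"
)

// runGuarded runs one restart cycle on a goroutine and contains any panic.
// This keeps the platform up, but nothing re-raises the panic or resets the
// workspace row: a cycle that panics after the status UPDATE leaves the row
// stuck in 'provisioning' with no user-visible error, which is exactly the
// pre-fix SaaS failure mode described in the diff.
func runGuarded(workspaceID string, cycle func(string)) {
	go func() {
		defer func() {
			if r := recover(); r != nil {
				log.Printf("restart cycle for %s panicked: %v", workspaceID, r)
			}
		}()
		cycle(workspaceID)
	}()
}

func main() {
	runGuarded("ws-demo", func(string) {
		var p *struct{ n int }
		_ = p.n // nil dereference: panics, is recovered, and goes quiet
	})
	time.Sleep(50 * time.Millisecond) // let the goroutine run
}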
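
The fakeCPProv comment says tests that want to assert a method is unused should check `calls == 0` after a sync barrier. Below is a sketch of one workable version of that pattern. Since the happy path fires `go h.RestartByID(...)`, a plain-int counter read from the test goroutine can trip the race detector, so this stand-in uses an atomic counter plus a bounded settle window in place of a true barrier. The names countingCPProv and assertStopUnused are hypothetical, not part of this PR.

package handlers_test

import (
	"context"
	"sync/atomic"
	"testing"
	"time"
)

// countingCPProv: a race-safe variant of fakeCPProv's call counting.
type countingCPProv struct {
	stopCalls atomic.Int64
}

func (f *countingCPProv) Stop(_ context.Context, _ string) error {
	f.stopCalls.Add(1)
	return nil
}

// assertStopUnused gives any stray `go h.RestartByID(...)` goroutine a
// bounded window to run, re-checking the zero-call invariant throughout,
// so a late Stop call fails the test instead of slipping past a one-shot
// check made before the goroutine was scheduled.
func assertStopUnused(t *testing.T, f *countingCPProv) {
	t.Helper()
	deadline := time.Now().Add(200 * time.Millisecond)
	for time.Now().Before(deadline) {
		if n := f.stopCalls.Load(); n != 0 {
			t.Fatalf("expected cpProv.Stop to be unused; got %d call(s)", n)
		}
		time.Sleep(10 * time.Millisecond)
	}
}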
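
Finally, the new branch in runRestartCycle is, per its own comment, another copy of the cpProv-vs-Docker dispatch already present at workspace.go:431-433 and workspace_restart.go:197 and 596. A possible follow-up, sketched here with stand-in types since the real signatures are not in this diff, would be to funnel every call site through one method so they cannot drift:

package main

import "fmt"

// handler is a stand-in for WorkspaceHandler; the real fields and the real
// provision* signatures live in the repo and are assumed, not shown here.
type handler struct {
	cpProv any // non-nil on the SaaS / EC2-backed path
}

func (h *handler) provisionWorkspaceCP(id string) { fmt.Println("CP path:", id) }
func (h *handler) provisionWorkspace(id string)   { fmt.Println("Docker path:", id) }

// provision is the single dispatch point every caller would go through.
func (h *handler) provision(id string) {
	if h.cpProv != nil {
		h.provisionWorkspaceCP(id)
		return
	}
	h.provisionWorkspace(id)
}

func main() {
	(&handler{cpProv: struct{}{}}).provision("ws-1") // CP path: ws-1
	(&handler{}).provision("ws-2")                   // Docker path: ws-2
}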