diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go
index a4c37c37..6339fb43 100644
--- a/workspace-server/internal/handlers/workspace_provision.go
+++ b/workspace-server/internal/handlers/workspace_provision.go
@@ -22,30 +22,33 @@ import (
 // provisionWorkspaceCP propagates up the goroutine stack and crashes the
 // whole workspace-server process — taking every other tenant workspace
 // down with it. With it, the panic is logged with a stack trace, the
-// workspace gets persistently marked failed (so the canvas surfaces
-// something instead of leaving it stuck in provisioning until the 10-min
-// sweeper fires), and the rest of the process keeps serving.
+// workspace is marked failed via markProvisionFailed (so the canvas
+// surfaces a failure card immediately instead of leaving the spinner
+// stuck on "provisioning" until the 10-min sweeper fires), and the rest
+// of the process keeps serving.
 //
 // Issue #2486 added this after the symmetric class — silent goroutine
 // exit, no log, no failure mark — was observed in prod. Even if the
 // root cause turns out not to be a panic, surfacing the panic class
 // closes one branch of "what could have happened" cleanly.
-func logProvisionPanic(workspaceID, mode string) {
+//
+// Method on *WorkspaceHandler (not free function) so the panic path can
+// reuse markProvisionFailed and emit the WORKSPACE_PROVISION_FAILED
+// broadcast — without the broadcast the canvas only learns of the
+// failure when the next poll/refresh hits the DB.
+func (h *WorkspaceHandler) logProvisionPanic(workspaceID, mode string) {
 	r := recover()
 	if r == nil {
 		return
 	}
 	log.Printf("Provisioner: PANIC during provision goroutine for %s (mode=%s): %v\nstack:\n%s",
 		workspaceID, mode, r, debug.Stack())
-	// Best-effort mark-failed via a fresh context — the provision goroutine's
-	// ctx may have been the one panicking. 10s is enough for a single UPDATE.
+	// Fresh context: the provision goroutine's ctx may have been the one
+	// panicking (timeout, cancelled). 10s is enough for the broadcast +
+	// single UPDATE inside markProvisionFailed.
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
-	if _, err := db.DB.ExecContext(ctx,
-		`UPDATE workspaces SET status='failed', last_sample_error=$2, updated_at=now() WHERE id=$1`,
-		workspaceID, fmt.Sprintf("provision panic: %v", r)); err != nil {
-		log.Printf("Provisioner: failed to persist panic-failure for %s: %v", workspaceID, err)
-	}
+	h.markProvisionFailed(ctx, workspaceID, fmt.Sprintf("provision panic: %v", r), nil)
 }
 
 // provisionWorkspace handles async container deployment with timeout.
@@ -64,7 +67,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
 	// neither a prepare-failed nor start-failed nor success log line, so an
 	// operator couldn't tell whether the goroutine ran at all.
 	log.Printf("Provisioner: goroutine entered for %s (runtime=%s, mode=docker)", workspaceID, payload.Runtime)
-	defer logProvisionPanic(workspaceID, "docker")
+	defer h.logProvisionPanic(workspaceID, "docker")
 
 	ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
 	defer cancel()
@@ -687,7 +690,7 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
 	// unable to distinguish "goroutine never started" from "started but
 	// returned via an unlogged path."
 	log.Printf("CPProvisioner: goroutine entered for %s (runtime=%s, mode=cp)", workspaceID, payload.Runtime)
-	defer logProvisionPanic(workspaceID, "cp")
+	defer h.logProvisionPanic(workspaceID, "cp")
 
 	ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
 	defer cancel()
diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
index 2100a49c..2c19e41b 100644
--- a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
@@ -29,7 +29,9 @@ import (
 //
 // If the silent-drop class is present in current head code, this test
 // fails because either (a) the entry-log count is < 7 (meaning one or
-// more goroutines never started — Go runtime issue), or (b) the
+// more goroutines reached the goroutine boundary but never produced
+// the entry-log line — entry log renamed/removed, or log writer
+// hijacked), or (b) the
 // recorder count is < 7 (meaning a goroutine entered but exited before
 // reaching cpProv.Start, via some unlogged path).
 //
diff --git a/workspace-server/internal/handlers/workspace_provision_panic_test.go b/workspace-server/internal/handlers/workspace_provision_panic_test.go
index 0a287fb3..f7776ee4 100644
--- a/workspace-server/internal/handlers/workspace_provision_panic_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_panic_test.go
@@ -14,28 +14,55 @@ import (
 // Pin the issue #2486 contract: a panic inside the provision goroutine must
 // (1) not propagate (the deferred recover swallows it), (2) log the panic
 // with a stack trace so an operator can see what blew up, and (3) mark the
-// workspace `failed` so the canvas surfaces the failure instead of the row
-// sitting in `provisioning` until the 10-min sweeper.
+// workspace `failed` AND broadcast WORKSPACE_PROVISION_FAILED so the canvas
+// flips the spinner to a failure card immediately — not after the 10-min
+// sweeper.
+//
+// Helper: newPanicTestHandler wires a captureBroadcaster + handler so each
+// test exercises the real markProvisionFailed path. The broadcaster capture
+// is what proves assertion (3) — without it, the panic recovery would mark
+// the row failed in the DB but the canvas wouldn't learn until next refresh.
+
+func newPanicTestHandler() (*WorkspaceHandler, *captureBroadcaster) {
+	cap := &captureBroadcaster{}
+	return NewWorkspaceHandler(cap, nil, "http://localhost:8080", ""), cap
+}
+
+// captureLog swaps log output to a buffer for the test and restores the
+// previous writer on cleanup. Capturing `prev` BEFORE SetOutput is
+// load-bearing — `log.Writer()` evaluated at defer-fire time would
+// return the buffer (not the original writer) and never restore it,
+// poisoning subsequent tests in the package.
+func captureLog(t *testing.T) *bytes.Buffer {
+	t.Helper()
+	var buf bytes.Buffer
+	prev := log.Writer()
+	log.SetOutput(&buf)
+	t.Cleanup(func() { log.SetOutput(prev) })
+	return &buf
+}
 
 func TestLogProvisionPanic_NoOpWhenNoPanic(t *testing.T) {
 	// Sanity: the deferred recover must be silent when nothing panicked.
 	// Otherwise every successful provision would emit a spurious panic log.
-	var buf bytes.Buffer
-	log.SetOutput(&buf)
-	defer log.SetOutput(log.Writer())
+	buf := captureLog(t)
+	h, cap := newPanicTestHandler()
 
 	func() {
-		defer logProvisionPanic("ws-no-panic", "cp")
+		defer h.logProvisionPanic("ws-no-panic", "cp")
 		// no panic
 	}()
 
 	if buf.Len() != 0 {
 		t.Fatalf("expected no log output when no panic, got: %q", buf.String())
 	}
+	if cap.lastData != nil {
+		t.Fatalf("expected no broadcast when no panic, got: %v", cap.lastData)
+	}
 }
 
 func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
-	// Wire a sqlmock so logProvisionPanic's UPDATE has somewhere to land
+	// Wire a sqlmock so markProvisionFailed's UPDATE has somewhere to land
 	// without needing a real Postgres. The mock asserts the SQL shape +
 	// args so a future refactor of the persist call doesn't silently
 	// stop marking the row failed.
@@ -49,13 +76,15 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
 	db.DB = mockDB
 	defer func() { db.DB = prevDB }()
 
-	mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
-		WithArgs("ws-panic", sqlmock.AnyArg()).
+	// markProvisionFailed issues:
+	//   UPDATE workspaces SET status = $3, last_sample_error = $2, updated_at = now() WHERE id = $1
+	// with args (workspaceID, msg, models.StatusFailed).
+	mock.ExpectExec(`UPDATE workspaces SET status`).
+		WithArgs("ws-panic", sqlmock.AnyArg(), sqlmock.AnyArg()).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 
-	var buf bytes.Buffer
-	log.SetOutput(&buf)
-	defer log.SetOutput(log.Writer())
+	buf := captureLog(t)
+	h, cap := newPanicTestHandler()
 
 	// Exercise: a function that defers logProvisionPanic + then panics.
 	// The recover MUST swallow the panic — if it propagates, the test
@@ -70,7 +99,7 @@
 				didNotPanic = false
 			}
 		}()
-		defer logProvisionPanic("ws-panic", "cp")
+		defer h.logProvisionPanic("ws-panic", "cp")
 		panic("simulated provision panic for #2486 regression")
 	}()
 
@@ -90,7 +119,18 @@
 	}
 
 	if err := mock.ExpectationsWereMet(); err != nil {
-		t.Errorf("sql expectations: %v — UPDATE workspaces … status='failed' was not issued", err)
+		t.Errorf("sql expectations: %v — UPDATE workspaces … status=failed was not issued", err)
+	}
+
+	// Canvas-broadcast assertion: the panic recovery MUST route through
+	// markProvisionFailed, which fires WORKSPACE_PROVISION_FAILED. Without
+	// this, the canvas spinner stays on "provisioning" until the sweeper
+	// or a poll — defeating the immediate-feedback purpose of this gate.
+	if cap.lastData == nil {
+		t.Fatal("expected broadcaster.RecordAndBroadcast to be called by panic recovery, got nil — canvas would not see the failure")
+	}
+	if errMsg, ok := cap.lastData["error"].(string); !ok || !strings.Contains(errMsg, "provision panic:") {
+		t.Errorf("broadcast payload missing/wrong 'error' field; got: %v", cap.lastData)
 	}
 }
 
@@ -109,21 +149,23 @@ func TestLogProvisionPanic_PersistFailureLogged(t *testing.T) {
 	db.DB = mockDB
 	defer func() { db.DB = prevDB }()
 
-	mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
-		WithArgs("ws-panic-persist-fail", sqlmock.AnyArg()).
+	mock.ExpectExec(`UPDATE workspaces SET status`).
+		WithArgs("ws-panic-persist-fail", sqlmock.AnyArg(), sqlmock.AnyArg()).
 		WillReturnError(sql.ErrConnDone)
 
-	var buf bytes.Buffer
-	log.SetOutput(&buf)
-	defer log.SetOutput(log.Writer())
+	buf := captureLog(t)
+	h, _ := newPanicTestHandler()
 
 	func() {
-		defer logProvisionPanic("ws-panic-persist-fail", "docker")
+		defer h.logProvisionPanic("ws-panic-persist-fail", "docker")
 		panic("simulated panic with DB unavailable")
 	}()
 
 	logged := buf.String()
-	if !strings.Contains(logged, "failed to persist panic-failure for ws-panic-persist-fail") {
-		t.Errorf("expected persist-failure log line; got: %q", logged)
+	// markProvisionFailed logs `markProvisionFailed: db update failed for <workspaceID>: <err>`
+	// when its UPDATE fails. That's the line that proves we surfaced the
+	// persist failure rather than swallowing it.
+	if !strings.Contains(logged, "markProvisionFailed: db update failed for ws-panic-persist-fail") {
+		t.Errorf("expected markProvisionFailed db-update-failure log line; got: %q", logged)
 	}
 }
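
Reviewer note (not part of the patch): the tests above lean on a captureBroadcaster test double that is referenced but not defined in this diff. Its real definition lives elsewhere in the handlers package; based only on how the tests use it (cap.lastData, the "RecordAndBroadcast" wording in the failure message, and being passed as the first argument to NewWorkspaceHandler), a minimal sketch could look like the fragment below. The RecordAndBroadcast name, its signature, and the lastType field are assumptions, not confirmed by this diff.

// Hypothetical sketch of the captureBroadcaster test double used by the panic
// tests. Only lastData and its map[string]interface{} shape are pinned by the
// assertions above; the interface it satisfies and the RecordAndBroadcast
// signature are assumed.
type captureBroadcaster struct {
	lastType string                 // most recent event type, e.g. "WORKSPACE_PROVISION_FAILED" (assumed field)
	lastData map[string]interface{} // most recent payload; the tests read the "error" key
}

// RecordAndBroadcast records the event instead of pushing it to canvas clients.
func (c *captureBroadcaster) RecordAndBroadcast(eventType string, data map[string]interface{}) {
	c.lastType = eventType
	c.lastData = data
}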
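markProvisionFailed itself is also outside this diff. What the new tests pin down is the UPDATE shape and argument order ($1 id, $2 error message, $3 status), the "markProvisionFailed: db update failed for ..." log line on a failed UPDATE, and a WORKSPACE_PROVISION_FAILED broadcast whose payload carries an "error" field. The fragment below is a sketch consistent with those expectations only; the broadcaster field name, any payload keys other than "error", the ordering of broadcast vs. UPDATE, and the meaning of the fourth parameter are all assumptions, and the snippet relies on the handlers package's existing WorkspaceHandler, db, and models names.

// Hypothetical sketch of markProvisionFailed, reconstructed from the test
// expectations above. The real implementation is not part of this diff.
func (h *WorkspaceHandler) markProvisionFailed(ctx context.Context, workspaceID, msg string, extra map[string]interface{}) {
	// Broadcast so the canvas flips the spinner to a failure card immediately.
	// ("error" is the key the tests assert; the other keys are assumed.)
	data := map[string]interface{}{"workspace_id": workspaceID, "error": msg}
	for k, v := range extra { // extra is nil on the panic path; ranging a nil map is a no-op
		data[k] = v
	}
	h.broadcaster.RecordAndBroadcast("WORKSPACE_PROVISION_FAILED", data)

	// Persist the failure; arg order ($1 id, $2 error, $3 status) matches the
	// sqlmock expectation in TestLogProvisionPanic_RecoversAndMarksFailed.
	if _, err := db.DB.ExecContext(ctx,
		`UPDATE workspaces SET status = $3, last_sample_error = $2, updated_at = now() WHERE id = $1`,
		workspaceID, msg, models.StatusFailed); err != nil {
		log.Printf("markProvisionFailed: db update failed for %s: %v", workspaceID, err)
	}
}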