fix(provision): route panic recovery through markProvisionFailed + fix log capture
Three fixes addressing review of the issue #2486 observability PR: 1. CI failure: original inline UPDATE in logProvisionPanic used a hard-coded `status='failed'` literal, which trips workspace_status_enum_drift_test (the post-PR-#2396 gate that requires every status write to flow through models.Status* via parameterized $N). Refactor to call h.markProvisionFailed which uses StatusFailed parameterized. 2. Canvas-broadcast gap (review finding): inline UPDATE skipped RecordAndBroadcast, so panic recovery marked the row failed in DB but the canvas spinner stayed on "provisioning" until the next poll. markProvisionFailed fires WORKSPACE_PROVISION_FAILED, so canvas now flips to a failure card immediately. 3. Critical test bug (review finding): `defer log.SetOutput(log.Writer())` in three test sites evaluated log.Writer() at defer-fire time AFTER the SetOutput swap — restoring the buffer to itself, never restoring os.Stderr. Subsequent tests in the package were running with the panic tests' captured buffer as their writer. Extracted captureLog(t) helper that captures `prev` BEFORE the swap and uses t.Cleanup. Plus: softened the "goroutine never started" comment in the concurrent repro harness — the harness atomic-counts BEFORE the entry log fires, so "never started" was misleading; the real failure mode is "entry log renamed/removed or writer hijacked." Verified: full handlers suite passes; drift gate passes (Platform Go CI failure root-caused). Regression-injected the recover body again — both panic tests still fail as expected, confirming the contract is gated. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
fe92194584
commit
7a19724194
@ -22,30 +22,33 @@ import (
|
||||
// provisionWorkspaceCP propagates up the goroutine stack and crashes the
|
||||
// whole workspace-server process — taking every other tenant workspace
|
||||
// down with it. With it, the panic is logged with a stack trace, the
|
||||
// workspace gets persistently marked failed (so the canvas surfaces
|
||||
// something instead of leaving it stuck in provisioning until the 10-min
|
||||
// sweeper fires), and the rest of the process keeps serving.
|
||||
// workspace is marked failed via markProvisionFailed (so the canvas
|
||||
// surfaces a failure card immediately instead of leaving the spinner
|
||||
// stuck on "provisioning" until the 10-min sweeper fires), and the rest
|
||||
// of the process keeps serving.
|
||||
//
|
||||
// Issue #2486 added this after the symmetric class — silent goroutine
|
||||
// exit, no log, no failure mark — was observed in prod. Even if the
|
||||
// root cause turns out not to be a panic, surfacing the panic class
|
||||
// closes one branch of "what could have happened" cleanly.
|
||||
func logProvisionPanic(workspaceID, mode string) {
|
||||
//
|
||||
// Method on *WorkspaceHandler (not free function) so the panic path can
|
||||
// reuse markProvisionFailed and emit the WORKSPACE_PROVISION_FAILED
|
||||
// broadcast — without the broadcast the canvas only learns of the
|
||||
// failure when the next poll/refresh hits the DB.
|
||||
func (h *WorkspaceHandler) logProvisionPanic(workspaceID, mode string) {
|
||||
r := recover()
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
log.Printf("Provisioner: PANIC during provision goroutine for %s (mode=%s): %v\nstack:\n%s",
|
||||
workspaceID, mode, r, debug.Stack())
|
||||
// Best-effort mark-failed via a fresh context — the provision goroutine's
|
||||
// ctx may have been the one panicking. 10s is enough for a single UPDATE.
|
||||
// Fresh context: the provision goroutine's ctx may have been the one
|
||||
// panicking (timeout, cancelled). 10s is enough for the broadcast +
|
||||
// single UPDATE inside markProvisionFailed.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status='failed', last_sample_error=$2, updated_at=now() WHERE id=$1`,
|
||||
workspaceID, fmt.Sprintf("provision panic: %v", r)); err != nil {
|
||||
log.Printf("Provisioner: failed to persist panic-failure for %s: %v", workspaceID, err)
|
||||
}
|
||||
h.markProvisionFailed(ctx, workspaceID, fmt.Sprintf("provision panic: %v", r), nil)
|
||||
}
|
||||
|
||||
// provisionWorkspace handles async container deployment with timeout.
|
||||
@ -64,7 +67,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
// neither a prepare-failed nor start-failed nor success log line, so an
|
||||
// operator couldn't tell whether the goroutine ran at all.
|
||||
log.Printf("Provisioner: goroutine entered for %s (runtime=%s, mode=docker)", workspaceID, payload.Runtime)
|
||||
defer logProvisionPanic(workspaceID, "docker")
|
||||
defer h.logProvisionPanic(workspaceID, "docker")
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
|
||||
defer cancel()
|
||||
@ -687,7 +690,7 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
|
||||
// unable to distinguish "goroutine never started" from "started but
|
||||
// returned via an unlogged path."
|
||||
log.Printf("CPProvisioner: goroutine entered for %s (runtime=%s, mode=cp)", workspaceID, payload.Runtime)
|
||||
defer logProvisionPanic(workspaceID, "cp")
|
||||
defer h.logProvisionPanic(workspaceID, "cp")
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
|
||||
defer cancel()
|
||||
|
||||
@ -29,7 +29,9 @@ import (
|
||||
//
|
||||
// If the silent-drop class is present in current head code, this test
|
||||
// fails because either (a) the entry-log count is < 7 (meaning one or
|
||||
// more goroutines never started — Go runtime issue), or (b) the
|
||||
// more goroutines reached the goroutine boundary but never produced
|
||||
// the entry-log line — entry log renamed/removed, or log writer
|
||||
// hijacked), or (b) the
|
||||
// recorder count is < 7 (meaning a goroutine entered but exited before
|
||||
// reaching cpProv.Start, via some unlogged path).
|
||||
//
|
||||
|
||||
@ -14,28 +14,55 @@ import (
|
||||
// Pin the issue #2486 contract: a panic inside the provision goroutine must
|
||||
// (1) not propagate (the deferred recover swallows it), (2) log the panic
|
||||
// with a stack trace so an operator can see what blew up, and (3) mark the
|
||||
// workspace `failed` so the canvas surfaces the failure instead of the row
|
||||
// sitting in `provisioning` until the 10-min sweeper.
|
||||
// workspace `failed` AND broadcast WORKSPACE_PROVISION_FAILED so the canvas
|
||||
// flips the spinner to a failure card immediately — not after the 10-min
|
||||
// sweeper.
|
||||
//
|
||||
// Helper: newPanicTestHandler wires a captureBroadcaster + handler so each
|
||||
// test exercises the real markProvisionFailed path. The broadcaster capture
|
||||
// is what proves assertion (3) — without it, the panic recovery would mark
|
||||
// the row failed in the DB but the canvas wouldn't learn until next refresh.
|
||||
|
||||
func newPanicTestHandler() (*WorkspaceHandler, *captureBroadcaster) {
|
||||
cap := &captureBroadcaster{}
|
||||
return NewWorkspaceHandler(cap, nil, "http://localhost:8080", ""), cap
|
||||
}
|
||||
|
||||
// captureLog swaps log output to a buffer for the test and restores the
|
||||
// previous writer on cleanup. Capturing `prev` BEFORE SetOutput is
|
||||
// load-bearing — `log.Writer()` evaluated at defer-fire time would
|
||||
// return the buffer (not the original writer) and never restore it,
|
||||
// poisoning subsequent tests in the package.
|
||||
func captureLog(t *testing.T) *bytes.Buffer {
|
||||
t.Helper()
|
||||
var buf bytes.Buffer
|
||||
prev := log.Writer()
|
||||
log.SetOutput(&buf)
|
||||
t.Cleanup(func() { log.SetOutput(prev) })
|
||||
return &buf
|
||||
}
|
||||
|
||||
func TestLogProvisionPanic_NoOpWhenNoPanic(t *testing.T) {
|
||||
// Sanity: the deferred recover must be silent when nothing panicked.
|
||||
// Otherwise every successful provision would emit a spurious panic log.
|
||||
var buf bytes.Buffer
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(log.Writer())
|
||||
buf := captureLog(t)
|
||||
h, cap := newPanicTestHandler()
|
||||
|
||||
func() {
|
||||
defer logProvisionPanic("ws-no-panic", "cp")
|
||||
defer h.logProvisionPanic("ws-no-panic", "cp")
|
||||
// no panic
|
||||
}()
|
||||
|
||||
if buf.Len() != 0 {
|
||||
t.Fatalf("expected no log output when no panic, got: %q", buf.String())
|
||||
}
|
||||
if cap.lastData != nil {
|
||||
t.Fatalf("expected no broadcast when no panic, got: %v", cap.lastData)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
|
||||
// Wire a sqlmock so logProvisionPanic's UPDATE has somewhere to land
|
||||
// Wire a sqlmock so markProvisionFailed's UPDATE has somewhere to land
|
||||
// without needing a real Postgres. The mock asserts the SQL shape +
|
||||
// args so a future refactor of the persist call doesn't silently
|
||||
// stop marking the row failed.
|
||||
@ -49,13 +76,15 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prevDB }()
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
|
||||
WithArgs("ws-panic", sqlmock.AnyArg()).
|
||||
// markProvisionFailed issues:
|
||||
// UPDATE workspaces SET status = $3, last_sample_error = $2, updated_at = now() WHERE id = $1
|
||||
// with args (workspaceID, msg, models.StatusFailed).
|
||||
mock.ExpectExec(`UPDATE workspaces SET status`).
|
||||
WithArgs("ws-panic", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
var buf bytes.Buffer
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(log.Writer())
|
||||
buf := captureLog(t)
|
||||
h, cap := newPanicTestHandler()
|
||||
|
||||
// Exercise: a function that defers logProvisionPanic + then panics.
|
||||
// The recover MUST swallow the panic — if it propagates, the test
|
||||
@ -70,7 +99,7 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
|
||||
didNotPanic = false
|
||||
}
|
||||
}()
|
||||
defer logProvisionPanic("ws-panic", "cp")
|
||||
defer h.logProvisionPanic("ws-panic", "cp")
|
||||
panic("simulated provision panic for #2486 regression")
|
||||
}()
|
||||
|
||||
@ -90,7 +119,18 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
|
||||
}
|
||||
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sql expectations: %v — UPDATE workspaces … status='failed' was not issued", err)
|
||||
t.Errorf("sql expectations: %v — UPDATE workspaces … status=failed was not issued", err)
|
||||
}
|
||||
|
||||
// Canvas-broadcast assertion: the panic recovery MUST route through
|
||||
// markProvisionFailed, which fires WORKSPACE_PROVISION_FAILED. Without
|
||||
// this, the canvas spinner stays on "provisioning" until the sweeper
|
||||
// or a poll — defeating the immediate-feedback purpose of this gate.
|
||||
if cap.lastData == nil {
|
||||
t.Fatal("expected broadcaster.RecordAndBroadcast to be called by panic recovery, got nil — canvas would not see the failure")
|
||||
}
|
||||
if errMsg, ok := cap.lastData["error"].(string); !ok || !strings.Contains(errMsg, "provision panic:") {
|
||||
t.Errorf("broadcast payload missing/wrong 'error' field; got: %v", cap.lastData)
|
||||
}
|
||||
}
|
||||
|
||||
@ -109,21 +149,23 @@ func TestLogProvisionPanic_PersistFailureLogged(t *testing.T) {
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prevDB }()
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
|
||||
WithArgs("ws-panic-persist-fail", sqlmock.AnyArg()).
|
||||
mock.ExpectExec(`UPDATE workspaces SET status`).
|
||||
WithArgs("ws-panic-persist-fail", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnError(sql.ErrConnDone)
|
||||
|
||||
var buf bytes.Buffer
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(log.Writer())
|
||||
buf := captureLog(t)
|
||||
h, _ := newPanicTestHandler()
|
||||
|
||||
func() {
|
||||
defer logProvisionPanic("ws-panic-persist-fail", "docker")
|
||||
defer h.logProvisionPanic("ws-panic-persist-fail", "docker")
|
||||
panic("simulated panic with DB unavailable")
|
||||
}()
|
||||
|
||||
logged := buf.String()
|
||||
if !strings.Contains(logged, "failed to persist panic-failure for ws-panic-persist-fail") {
|
||||
t.Errorf("expected persist-failure log line; got: %q", logged)
|
||||
// markProvisionFailed logs `markProvisionFailed: db update failed for <id>: <err>`
|
||||
// when its UPDATE fails. That's the line that proves we surfaced the
|
||||
// persist failure rather than swallowing it.
|
||||
if !strings.Contains(logged, "markProvisionFailed: db update failed for ws-panic-persist-fail") {
|
||||
t.Errorf("expected markProvisionFailed db-update-failure log line; got: %q", logged)
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user