From 82cc3315171ee2ab61be31587750b5192534aade Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 20:11:11 -0700 Subject: [PATCH 1/2] test(provision): harden panic tests with re-raise guard + assert broadcast count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-merge follow-up to PR #2487 review feedback: 1. guardAgainstReraise(fn) helper around every panic-test exercise. The original RecoversAndMarksFailed had its own outer recover() to detect re-raise; NoOpWhenNoPanic and PersistFailureLogged didn't. If a future regression makes logProvisionPanic re-raise, those two would have crashed the test process (taking sibling tests down) instead of reporting a clean failure. Now all three use the shared guard. 2. Concurrent repro now asserts bcast.count == 7 — the new concurrentSafeBroadcaster's count field was added in the race fix but not actually consumed. Cross-checks the existing recorder-set assertion from a different angle: a goroutine could in principle reach cpProv.Start (recorder hits) but then lose its WORKSPACE_PROVISION_FAILED broadcast on the failure path. Pinning both rules out that silent-drop variant for the canvas-broadcast contract specifically. 3. Comment on captureLog noting log.SetOutput is process-global and incompatible with t.Parallel() — preempts a future footgun if someone parallelizes the panic suite. Verified: all four tests pass under -race; full handlers + db packages green under -race. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...rkspace_provision_concurrent_repro_test.go | 16 ++++++ .../workspace_provision_panic_test.go | 53 ++++++++++++------- 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go index 16473f70..e12b8fa8 100644 --- a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go +++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go @@ -189,6 +189,22 @@ func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) { } } + // Assertion 4: every goroutine's failure path called RecordAndBroadcast + // exactly once (via h.markProvisionFailed inside provisionWorkspaceCP's + // "start failed" arm). Cross-checks Assertion 2 from a different angle + // — if a goroutine reaches Start() but then loses its WORKSPACE_ + // PROVISION_FAILED broadcast, the canvas spinner sticks on + // "provisioning" until the sweeper. That regression class is what + // drove making logProvisionPanic a method on *WorkspaceHandler — so + // it's worth pinning here too. + bcast.mu.Lock() + bcastCount := bcast.count + bcast.mu.Unlock() + if bcastCount != numWorkspaces { + t.Errorf("broadcaster saw %d RecordAndBroadcast calls, want %d. SILENT-DROP CLASS: a goroutine reached cpProv.Start but never broadcast its failure.", + bcastCount, numWorkspaces) + } + if err := mock.ExpectationsWereMet(); err != nil { // Soft-fail: under concurrency some queries may have been // re-ordered relative to the (non-strict) expectation set, diff --git a/workspace-server/internal/handlers/workspace_provision_panic_test.go b/workspace-server/internal/handlers/workspace_provision_panic_test.go index f7776ee4..d9705f30 100644 --- a/workspace-server/internal/handlers/workspace_provision_panic_test.go +++ b/workspace-server/internal/handlers/workspace_provision_panic_test.go @@ -33,6 +33,10 @@ func newPanicTestHandler() (*WorkspaceHandler, *captureBroadcaster) { // load-bearing — `log.Writer()` evaluated at defer-fire time would // return the buffer (not the original writer) and never restore it, // poisoning subsequent tests in the package. +// +// log.SetOutput is process-global: do NOT call this from a test that +// uses t.Parallel() or two captures will race + clobber. The panic +// tests below are intentionally non-parallel for this reason. func captureLog(t *testing.T) *bytes.Buffer { t.Helper() var buf bytes.Buffer @@ -42,16 +46,35 @@ func captureLog(t *testing.T) *bytes.Buffer { return &buf } +// guardAgainstReraise wraps a function in a recover-arm that flips the +// returned bool to false if anything propagates past `defer +// h.logProvisionPanic(...)`. Used in every panic test (not just +// RecoversAndMarksFailed) so a future regression that re-raises from +// the recovery path surfaces as a clean test failure, not a process +// abort that crashes sibling tests. +func guardAgainstReraise(fn func()) (didNotPanic bool) { + didNotPanic = true + defer func() { + if r := recover(); r != nil { + didNotPanic = false + } + }() + fn() + return +} + func TestLogProvisionPanic_NoOpWhenNoPanic(t *testing.T) { // Sanity: the deferred recover must be silent when nothing panicked. // Otherwise every successful provision would emit a spurious panic log. buf := captureLog(t) h, cap := newPanicTestHandler() - func() { + if !guardAgainstReraise(func() { defer h.logProvisionPanic("ws-no-panic", "cp") // no panic - }() + }) { + t.Fatal("logProvisionPanic re-raised on the no-panic path — recover() returned non-nil for a goroutine that didn't panic") + } if buf.Len() != 0 { t.Fatalf("expected no log output when no panic, got: %q", buf.String()) @@ -87,23 +110,13 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) { h, cap := newPanicTestHandler() // Exercise: a function that defers logProvisionPanic + then panics. - // The recover MUST swallow the panic — if it propagates, the test - // process crashes and the panic message bubbles up as a Go test - // failure rather than the assertion below. - didNotPanic := true - func() { - defer func() { - // If logProvisionPanic re-raised, this catches it for the - // test. We assert below that it did NOT re-raise. - if r := recover(); r != nil { - didNotPanic = false - } - }() + // The recover MUST swallow the panic — if it propagates, + // guardAgainstReraise catches it instead of letting the test + // process abort. + if !guardAgainstReraise(func() { defer h.logProvisionPanic("ws-panic", "cp") panic("simulated provision panic for #2486 regression") - }() - - if !didNotPanic { + }) { t.Fatal("logProvisionPanic re-raised the panic — the recover() arm did not swallow it") } @@ -156,10 +169,12 @@ func TestLogProvisionPanic_PersistFailureLogged(t *testing.T) { buf := captureLog(t) h, _ := newPanicTestHandler() - func() { + if !guardAgainstReraise(func() { defer h.logProvisionPanic("ws-panic-persist-fail", "docker") panic("simulated panic with DB unavailable") - }() + }) { + t.Fatal("logProvisionPanic re-raised when the persist-failure path was exercised — recover() arm did not swallow") + } logged := buf.String() // markProvisionFailed logs `markProvisionFailed: db update failed for : ` From 955755ce1e8c91dfc3edd6213a6d551f740d5875 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 20:14:39 -0700 Subject: [PATCH 2/2] test(provision): tighten Assertion 4 message to name both failure modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review nit on PR #2491: the previous message ("a goroutine reached cpProv.Start but never broadcast its failure") could mislead an operator if Assertion 2 and 4 both fire — Assertion 4 also catches "goroutine exited via an earlier path before reaching Start." Spell both modes out and cross-reference Assertion 2. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../handlers/workspace_provision_concurrent_repro_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go index e12b8fa8..a17d5037 100644 --- a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go +++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go @@ -201,7 +201,7 @@ func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) { bcastCount := bcast.count bcast.mu.Unlock() if bcastCount != numWorkspaces { - t.Errorf("broadcaster saw %d RecordAndBroadcast calls, want %d. SILENT-DROP CLASS: a goroutine reached cpProv.Start but never broadcast its failure.", + t.Errorf("broadcaster saw %d RecordAndBroadcast calls, want %d. SILENT-DROP CLASS: either a goroutine reached cpProv.Start but was lost before markProvisionFailed, OR it exited via an earlier path before reaching Start (cross-check Assertion 2 above).", bcastCount, numWorkspaces) }