diff --git a/workspace-server/cmd/server/cp_config_test.go b/workspace-server/cmd/server/cp_config_test.go index 669f682d0..adf707911 100644 --- a/workspace-server/cmd/server/cp_config_test.go +++ b/workspace-server/cmd/server/cp_config_test.go @@ -7,6 +7,7 @@ import ( "os" "strings" "testing" + "time" ) // TestRefreshEnvFromCP_NoopWhenNotSaaS: without MOLECULE_ORG_ID or @@ -243,3 +244,59 @@ func TestRefreshEnvFromCP_RejectsOversizedValue(t *testing.T) { "original", len(got)) } } + +// TestRefreshEnvFromCP_ClientTimeoutFiresOnSlowUpstream: the +// core#2125 fix replaced http.DefaultClient with +// `&http.Client{Timeout: 10 * time.Second}`. This regression test +// proves that a hung / slow upstream does NOT block the boot — the +// client times out at 10s and refreshEnvFromCP returns an error +// within a small bound. Without the timeout, this test would block +// for 12s+ (the slow server's delay) AND the test would still pass +// on the wrong invariant — so we ALSO assert the elapsed wall time +// is well under the server delay (proving the timeout fired, not +// the server response). +// +// Runtime cost: ~10s wall clock. Acceptable for a regression test +// that runs once per CI build; the alternative (mock the http +// transport) would test the mock, not the real http.Client.Timeout +// contract — exactly the trade-off core#2125 is about. +func TestRefreshEnvFromCP_ClientTimeoutFiresOnSlowUpstream(t *testing.T) { + if testing.Short() { + t.Skip("skipping 10s slow-upstream test in -short mode") + } + // Server that delays 12s — LONGER than the 10s client timeout, so + // the timeout MUST fire first. If the timeout were absent (or set + // higher), this handler would run to completion and refreshEnvFromCP + // would return success after 12s — both wrong. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + select { + case <-time.After(12 * time.Second): + case <-r.Context().Done(): + } + })) + defer srv.Close() + + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", srv.URL) + + start := time.Now() + err := refreshEnvFromCP() + elapsed := time.Since(start) + + if err == nil { + t.Fatalf("expected timeout error on slow upstream, got nil (would mean the 10s timeout is missing)") + } + // Bound the elapsed time at 11s — the 10s client timeout + up to + // 1s of slack for goroutine scheduling. If this is > 11s, either + // the timeout was raised or the test server isn't actually slow. + if elapsed > 11*time.Second { + t.Errorf("refreshEnvFromCP took %v on a 12s-delay server; expected the 10s client timeout to fire first (elapsed < 11s)", elapsed) + } + // The error should mention timeout / deadline — proves the failure + // mode is the client.Timeout, not a misrouted request. + errStr := err.Error() + if !strings.Contains(errStr, "timeout") && !strings.Contains(errStr, "deadline") { + t.Errorf("error should mention timeout/deadline (the client.Timeout path), got: %v", err) + } +} diff --git a/workspace-server/internal/middleware/mcp_ratelimit.go b/workspace-server/internal/middleware/mcp_ratelimit.go index 3133e2358..8b5c61f2c 100644 --- a/workspace-server/internal/middleware/mcp_ratelimit.go +++ b/workspace-server/internal/middleware/mcp_ratelimit.go @@ -4,7 +4,6 @@ import ( "context" "crypto/sha256" "fmt" - "log" "net/http" "strconv" "strings" @@ -42,11 +41,7 @@ func NewMCPRateLimiter(rate int, interval time.Duration, ctx context.Context) *M interval: interval, } go func() { - defer func() { - if r := recover(); r != nil { - log.Printf("mcp_ratelimit: PANIC in bucket cleanup: %v", r) - } - }() + defer recoverPanic("mcp_ratelimit: bucket cleanup") ticker := time.NewTicker(5 * time.Minute) defer ticker.Stop() for { diff --git a/workspace-server/internal/middleware/panic_recovery_test.go b/workspace-server/internal/middleware/panic_recovery_test.go new file mode 100644 index 000000000..405a2d187 --- /dev/null +++ b/workspace-server/internal/middleware/panic_recovery_test.go @@ -0,0 +1,130 @@ +package middleware + +import ( + "bytes" + "log" + "strings" + "testing" + "time" +) + +// TestRecoverPanic_RecoversFromPanicInGoroutine: the core#2125 +// family of fixes added `recover()` wrappers to 6 long-lived +// background goroutines (cache sweepers, rate-limit cleanup, session +// cache, A2A SSE idle watcher, terminal bridges, importer provision +// goroutine). The wrapper was refactored into a single +// `recoverPanic(prefix)` helper so the panic-recovery contract is +// testable in one place. This test proves the contract: +// +// 1. A goroutine that defers `recoverPanic(...)` and then panics +// does NOT crash the test process — the panic is contained. +// 2. The recovered value is logged with the caller's prefix — so +// operators can grep for the specific goroutine that tripped. +// 3. The goroutine's deferred epilogue (e.g. `close(done)`) STILL +// runs after the panic — the wrapper returns normally, so any +// `defer close(done)` placed BEFORE the recoverPanic defer +// fires (defers run in LIFO; the close runs after the recover). +// +// Runtime: sub-millisecond. +func TestRecoverPanic_RecoversFromPanicInGoroutine(t *testing.T) { + // Capture the log output so we can assert the prefix appears + // and the recovered value is included. + var buf bytes.Buffer + prevWriter := log.Writer() + prevFlags := log.Flags() + log.SetOutput(&buf) + log.SetFlags(0) + t.Cleanup(func() { + log.SetOutput(prevWriter) + log.SetFlags(prevFlags) + }) + + done := make(chan struct{}) + go func() { + defer recoverPanic("test_goroutine") + defer close(done) + panic("simulated panic value") + }() + + // Bound the wait so a regressed recoverPanic (one that fails to + // actually recover) surfaces as a test hang, not a CI timeout. + select { + case <-done: + // Recovered and returned cleanly. + case <-time.After(1 * time.Second): + t.Fatal("goroutine did not recover within 1s — recoverPanic is broken (or the panic is propagating)") + } + + // Assert the log line mentions the prefix AND the recovered value. + out := buf.String() + if !strings.Contains(out, "test_goroutine") { + t.Errorf("recoverPanic log should include the prefix; got: %q", out) + } + if !strings.Contains(out, "simulated panic value") { + t.Errorf("recoverPanic log should include the recovered panic value; got: %q", out) + } + if !strings.Contains(out, "PANIC") { + t.Errorf("recoverPanic log should include the 'PANIC' marker for grep-ability; got: %q", out) + } +} + +// TestRecoverPanic_NoopOnNormalReturn: a goroutine that defers +// recoverPanic(...) and returns normally must NOT log anything — +// `recover()` returns nil when there was no panic, and the wrapper +// must not emit a spurious "PANIC" line in that case (which would +// page operators for nothing and obscure the real signal). +func TestRecoverPanic_NoopOnNormalReturn(t *testing.T) { + var buf bytes.Buffer + prevWriter := log.Writer() + prevFlags := log.Flags() + log.SetOutput(&buf) + log.SetFlags(0) + t.Cleanup(func() { + log.SetOutput(prevWriter) + log.SetFlags(prevFlags) + }) + + done := make(chan struct{}) + go func() { + defer recoverPanic("normal_return_goroutine") + defer close(done) + // No panic — just return. + }() + + select { + case <-done: + case <-time.After(1 * time.Second): + t.Fatal("normal-return goroutine did not finish in 1s") + } + + if out := buf.String(); strings.Contains(out, "PANIC") { + t.Errorf("recoverPanic must NOT log on a normal return; got: %q", out) + } +} + +// (REMOVED) TestHTTPPanicRecovery_HandlerPanicReturns500: +// +// Originally added in commit e4b649c9 to address CR2's first +// round of feedback ("exercise the real HTTP panic-recovery +// path"). On the second round of review, CR2 correctly flagged +// that the test wired a synthetic `gin.New() + gin.Recovery()` +// engine rather than the production `router.Setup` flow (which +// uses `gin.Default()`). Exercising the production router +// requires constructing ~10 dependencies (Hub, Broadcaster, +// Provisioner, handlers.WorkspaceHandler, etc.) — out of scope +// for this regression-test PR. +// +// Per CR2's option 2 in the second review, the synthetic +// handler-panic coverage has been REMOVED from the PR. The +// PR's merge-blocking coverage is now narrowed to: +// (a) the real http client.Timeout regression (10s slow- +// upstream test against the production refreshEnvFromCP) +// (b) the recoverPanic helper contract (covered for the 3 +// in-production call sites in internal/middleware: the +// session_auth sweeper, ratelimit cleanup, mcp_ratelimit +// cleanup) +// Adding a real production-router handler-panic regression is +// a candidate for a follow-up PR — the work is "make router.Setup +// testable with the minimum dependency surface, then assert +// 500+no-crash through the real path", which is a non-trivial +// refactor on its own. diff --git a/workspace-server/internal/middleware/ratelimit.go b/workspace-server/internal/middleware/ratelimit.go index fa19a5da1..9a544446e 100644 --- a/workspace-server/internal/middleware/ratelimit.go +++ b/workspace-server/internal/middleware/ratelimit.go @@ -3,7 +3,6 @@ package middleware import ( "context" - "log" "net/http" "strconv" "strings" @@ -36,11 +35,7 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate interval: interval, } go func() { - defer func() { - if r := recover(); r != nil { - log.Printf("ratelimit: PANIC in bucket cleanup: %v", r) - } - }() + defer recoverPanic("ratelimit: bucket cleanup") ticker := time.NewTicker(5 * time.Minute) defer ticker.Stop() for { diff --git a/workspace-server/internal/middleware/session_auth.go b/workspace-server/internal/middleware/session_auth.go index 62c7a970b..f02e29a4c 100644 --- a/workspace-server/internal/middleware/session_auth.go +++ b/workspace-server/internal/middleware/session_auth.go @@ -116,11 +116,7 @@ func sessionCachePut(key string, ok bool) { func init() { go func() { - defer func() { - if r := recover(); r != nil { - log.Printf("session_auth: PANIC in cache sweeper: %v", r) - } - }() + defer recoverPanic("session_auth: cache sweeper") // Jitter startup so restarts don't align sweeps. time.Sleep(time.Duration(rand.Int64N(int64(sessionCacheSweepEvery)))) t := time.NewTicker(sessionCacheSweepEvery) @@ -131,6 +127,30 @@ func init() { }() } +// recoverPanic is the canonical panic-recovery wrapper for the +// long-lived background goroutines the workspace-server middleware +// package runs (cache sweepers, rate-limit cleanup, session cache, +// A2A SSE idle watcher, terminal bridges, importer provision +// goroutine — see core#2125). It is invoked via `defer` at the +// TOP of any goroutine that must not let a panic crash the process +// (the package has no global recover; a panic that escapes a +// goroutine takes the whole workspace-server down). +// +// The wrapper logs the recovered value with a caller-supplied +// prefix so the operator can grep for the specific goroutine that +// tripped, then returns. The deferred caller continues to its +// function epilogue (typically `close(done)`). +// +// Refactored from inline `defer func() { if r := recover(); ... }()` +// blocks (which were duplicated across 6 call sites in #2125) so +// the panic-recovery contract is testable in one place — see +// TestRecoverPanic_RecoversFromPanicInGoroutine in panic_recovery_test.go. +func recoverPanic(prefix string) { + if r := recover(); r != nil { + log.Printf("%s: PANIC: %v", prefix, r) + } +} + // sweepExpired removes expired entries so a low-hit-rate cache still // releases memory. Cheap — we hold the lock briefly per entry. func sweepExpired() {