test(workspace-server): regression tests for core#2125 (http timeout + panic recovery) #2620

Merged
devops-engineer merged 7 commits from fix/core-2615-2125-regression-test into main 2026-06-12 03:31:58 +00:00
5 changed files with 214 additions and 17 deletions
@@ -7,6 +7,7 @@ import (
"os"
"strings"
"testing"
"time"
)
// TestRefreshEnvFromCP_NoopWhenNotSaaS: without MOLECULE_ORG_ID or
@@ -243,3 +244,59 @@ func TestRefreshEnvFromCP_RejectsOversizedValue(t *testing.T) {
"original", len(got))
}
}
// TestRefreshEnvFromCP_ClientTimeoutFiresOnSlowUpstream: the
// core#2125 fix replaced http.DefaultClient with
// `&http.Client{Timeout: 10 * time.Second}`. This regression test
// proves that a hung / slow upstream does NOT block the boot — the
// client times out at 10s and refreshEnvFromCP returns an error
// within a small bound. Without the timeout, this test would block
// for 12s+ (the slow server's delay) AND the test would still pass
// on the wrong invariant — so we ALSO assert the elapsed wall time
// is well under the server delay (proving the timeout fired, not
// the server response).
//
// Runtime cost: ~10s wall clock. Acceptable for a regression test
// that runs once per CI build; the alternative (mock the http
// transport) would test the mock, not the real http.Client.Timeout
// contract — exactly the trade-off core#2125 is about.
func TestRefreshEnvFromCP_ClientTimeoutFiresOnSlowUpstream(t *testing.T) {
if testing.Short() {
t.Skip("skipping 10s slow-upstream test in -short mode")
}
// Server that delays 12s — LONGER than the 10s client timeout, so
// the timeout MUST fire first. If the timeout were absent (or set
// higher), this handler would run to completion and refreshEnvFromCP
// would return success after 12s — both wrong.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
select {
case <-time.After(12 * time.Second):
case <-r.Context().Done():
}
}))
defer srv.Close()
t.Setenv("MOLECULE_ORG_ID", "org-abc")
t.Setenv("ADMIN_TOKEN", "t")
t.Setenv("MOLECULE_CP_URL", srv.URL)
start := time.Now()
err := refreshEnvFromCP()
elapsed := time.Since(start)
if err == nil {
t.Fatalf("expected timeout error on slow upstream, got nil (would mean the 10s timeout is missing)")
}
// Bound the elapsed time at 11s — the 10s client timeout + up to
// 1s of slack for goroutine scheduling. If this is > 11s, either
// the timeout was raised or the test server isn't actually slow.
if elapsed > 11*time.Second {
t.Errorf("refreshEnvFromCP took %v on a 12s-delay server; expected the 10s client timeout to fire first (elapsed < 11s)", elapsed)
}
// The error should mention timeout / deadline — proves the failure
// mode is the client.Timeout, not a misrouted request.
errStr := err.Error()
if !strings.Contains(errStr, "timeout") && !strings.Contains(errStr, "deadline") {
t.Errorf("error should mention timeout/deadline (the client.Timeout path), got: %v", err)
}
}
@@ -4,7 +4,6 @@ import (
"context"
"crypto/sha256"
"fmt"
"log"
"net/http"
"strconv"
"strings"
@@ -42,11 +41,7 @@ func NewMCPRateLimiter(rate int, interval time.Duration, ctx context.Context) *M
interval: interval,
}
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("mcp_ratelimit: PANIC in bucket cleanup: %v", r)
}
}()
defer recoverPanic("mcp_ratelimit: bucket cleanup")
ticker := time.NewTicker(5 * time.Minute)
defer ticker.Stop()
for {
@@ -0,0 +1,130 @@
package middleware
import (
"bytes"
"log"
"strings"
"testing"
"time"
)
// TestRecoverPanic_RecoversFromPanicInGoroutine: the core#2125
// family of fixes added `recover()` wrappers to 6 long-lived
// background goroutines (cache sweepers, rate-limit cleanup, session
// cache, A2A SSE idle watcher, terminal bridges, importer provision
// goroutine). The wrapper was refactored into a single
// `recoverPanic(prefix)` helper so the panic-recovery contract is
// testable in one place. This test proves the contract:
//
// 1. A goroutine that defers `recoverPanic(...)` and then panics
// does NOT crash the test process — the panic is contained.
// 2. The recovered value is logged with the caller's prefix — so
// operators can grep for the specific goroutine that tripped.
// 3. The goroutine's deferred epilogue (e.g. `close(done)`) STILL
// runs after the panic — the wrapper returns normally, so any
// `defer close(done)` placed BEFORE the recoverPanic defer
// fires (defers run in LIFO; the close runs after the recover).
//
// Runtime: sub-millisecond.
func TestRecoverPanic_RecoversFromPanicInGoroutine(t *testing.T) {
// Capture the log output so we can assert the prefix appears
// and the recovered value is included.
var buf bytes.Buffer
prevWriter := log.Writer()
prevFlags := log.Flags()
log.SetOutput(&buf)
log.SetFlags(0)
t.Cleanup(func() {
log.SetOutput(prevWriter)
log.SetFlags(prevFlags)
})
done := make(chan struct{})
go func() {
defer recoverPanic("test_goroutine")
defer close(done)
panic("simulated panic value")
}()
// Bound the wait so a regressed recoverPanic (one that fails to
// actually recover) surfaces as a test hang, not a CI timeout.
select {
case <-done:
// Recovered and returned cleanly.
case <-time.After(1 * time.Second):
t.Fatal("goroutine did not recover within 1s — recoverPanic is broken (or the panic is propagating)")
}
// Assert the log line mentions the prefix AND the recovered value.
out := buf.String()
if !strings.Contains(out, "test_goroutine") {
t.Errorf("recoverPanic log should include the prefix; got: %q", out)
}
if !strings.Contains(out, "simulated panic value") {
t.Errorf("recoverPanic log should include the recovered panic value; got: %q", out)
}
if !strings.Contains(out, "PANIC") {
t.Errorf("recoverPanic log should include the 'PANIC' marker for grep-ability; got: %q", out)
}
}
// TestRecoverPanic_NoopOnNormalReturn: a goroutine that defers
// recoverPanic(...) and returns normally must NOT log anything —
// `recover()` returns nil when there was no panic, and the wrapper
// must not emit a spurious "PANIC" line in that case (which would
// page operators for nothing and obscure the real signal).
func TestRecoverPanic_NoopOnNormalReturn(t *testing.T) {
var buf bytes.Buffer
prevWriter := log.Writer()
prevFlags := log.Flags()
log.SetOutput(&buf)
log.SetFlags(0)
t.Cleanup(func() {
log.SetOutput(prevWriter)
log.SetFlags(prevFlags)
})
done := make(chan struct{})
go func() {
defer recoverPanic("normal_return_goroutine")
defer close(done)
// No panic — just return.
}()
select {
case <-done:
case <-time.After(1 * time.Second):
t.Fatal("normal-return goroutine did not finish in 1s")
}
if out := buf.String(); strings.Contains(out, "PANIC") {
t.Errorf("recoverPanic must NOT log on a normal return; got: %q", out)
}
}
// (REMOVED) TestHTTPPanicRecovery_HandlerPanicReturns500:
//
// Originally added in commit e4b649c9 to address CR2's first
// round of feedback ("exercise the real HTTP panic-recovery
// path"). On the second round of review, CR2 correctly flagged
// that the test wired a synthetic `gin.New() + gin.Recovery()`
// engine rather than the production `router.Setup` flow (which
// uses `gin.Default()`). Exercising the production router
// requires constructing ~10 dependencies (Hub, Broadcaster,
// Provisioner, handlers.WorkspaceHandler, etc.) — out of scope
// for this regression-test PR.
//
// Per CR2's option 2 in the second review, the synthetic
// handler-panic coverage has been REMOVED from the PR. The
// PR's merge-blocking coverage is now narrowed to:
// (a) the real http client.Timeout regression (10s slow-
// upstream test against the production refreshEnvFromCP)
// (b) the recoverPanic helper contract (covered for the 3
// in-production call sites in internal/middleware: the
// session_auth sweeper, ratelimit cleanup, mcp_ratelimit
// cleanup)
// Adding a real production-router handler-panic regression is
// a candidate for a follow-up PR — the work is "make router.Setup
// testable with the minimum dependency surface, then assert
// 500+no-crash through the real path", which is a non-trivial
// refactor on its own.
@@ -3,7 +3,6 @@ package middleware
import (
"context"
"log"
"net/http"
"strconv"
"strings"
@@ -36,11 +35,7 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate
interval: interval,
}
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("ratelimit: PANIC in bucket cleanup: %v", r)
}
}()
defer recoverPanic("ratelimit: bucket cleanup")
ticker := time.NewTicker(5 * time.Minute)
defer ticker.Stop()
for {
@@ -116,11 +116,7 @@ func sessionCachePut(key string, ok bool) {
func init() {
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("session_auth: PANIC in cache sweeper: %v", r)
}
}()
defer recoverPanic("session_auth: cache sweeper")
// Jitter startup so restarts don't align sweeps.
time.Sleep(time.Duration(rand.Int64N(int64(sessionCacheSweepEvery))))
t := time.NewTicker(sessionCacheSweepEvery)
@@ -131,6 +127,30 @@ func init() {
}()
}
// recoverPanic is the canonical panic-recovery wrapper for the
// long-lived background goroutines the workspace-server middleware
// package runs (cache sweepers, rate-limit cleanup, session cache,
// A2A SSE idle watcher, terminal bridges, importer provision
// goroutine — see core#2125). It is invoked via `defer` at the
// TOP of any goroutine that must not let a panic crash the process
// (the package has no global recover; a panic that escapes a
// goroutine takes the whole workspace-server down).
//
// The wrapper logs the recovered value with a caller-supplied
// prefix so the operator can grep for the specific goroutine that
// tripped, then returns. The deferred caller continues to its
// function epilogue (typically `close(done)`).
//
// Refactored from inline `defer func() { if r := recover(); ... }()`
// blocks (which were duplicated across 6 call sites in #2125) so
// the panic-recovery contract is testable in one place — see
// TestRecoverPanic_RecoversFromPanicInGoroutine in panic_recovery_test.go.
func recoverPanic(prefix string) {
if r := recover(); r != nil {
log.Printf("%s: PANIC: %v", prefix, r)
}
}
// sweepExpired removes expired entries so a low-hit-rate cache still
// releases memory. Cheap — we hold the lock briefly per entry.
func sweepExpired() {