test(workspace-server): regression tests for core#2125 (http timeout + panic recovery) #2620
@@ -7,6 +7,7 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestRefreshEnvFromCP_NoopWhenNotSaaS: without MOLECULE_ORG_ID or
|
||||
@@ -243,3 +244,59 @@ func TestRefreshEnvFromCP_RejectsOversizedValue(t *testing.T) {
|
||||
"original", len(got))
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_ClientTimeoutFiresOnSlowUpstream: the
|
||||
// core#2125 fix replaced http.DefaultClient with
|
||||
// `&http.Client{Timeout: 10 * time.Second}`. This regression test
|
||||
// proves that a hung / slow upstream does NOT block the boot — the
|
||||
// client times out at 10s and refreshEnvFromCP returns an error
|
||||
// within a small bound. Without the timeout, this test would block
|
||||
// for 12s+ (the slow server's delay) AND the test would still pass
|
||||
// on the wrong invariant — so we ALSO assert the elapsed wall time
|
||||
// is well under the server delay (proving the timeout fired, not
|
||||
// the server response).
|
||||
//
|
||||
// Runtime cost: ~10s wall clock. Acceptable for a regression test
|
||||
// that runs once per CI build; the alternative (mock the http
|
||||
// transport) would test the mock, not the real http.Client.Timeout
|
||||
// contract — exactly the trade-off core#2125 is about.
|
||||
func TestRefreshEnvFromCP_ClientTimeoutFiresOnSlowUpstream(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping 10s slow-upstream test in -short mode")
|
||||
}
|
||||
// Server that delays 12s — LONGER than the 10s client timeout, so
|
||||
// the timeout MUST fire first. If the timeout were absent (or set
|
||||
// higher), this handler would run to completion and refreshEnvFromCP
|
||||
// would return success after 12s — both wrong.
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
select {
|
||||
case <-time.After(12 * time.Second):
|
||||
case <-r.Context().Done():
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-abc")
|
||||
t.Setenv("ADMIN_TOKEN", "t")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
|
||||
start := time.Now()
|
||||
err := refreshEnvFromCP()
|
||||
elapsed := time.Since(start)
|
||||
|
||||
if err == nil {
|
||||
t.Fatalf("expected timeout error on slow upstream, got nil (would mean the 10s timeout is missing)")
|
||||
}
|
||||
// Bound the elapsed time at 11s — the 10s client timeout + up to
|
||||
// 1s of slack for goroutine scheduling. If this is > 11s, either
|
||||
// the timeout was raised or the test server isn't actually slow.
|
||||
if elapsed > 11*time.Second {
|
||||
t.Errorf("refreshEnvFromCP took %v on a 12s-delay server; expected the 10s client timeout to fire first (elapsed < 11s)", elapsed)
|
||||
}
|
||||
// The error should mention timeout / deadline — proves the failure
|
||||
// mode is the client.Timeout, not a misrouted request.
|
||||
errStr := err.Error()
|
||||
if !strings.Contains(errStr, "timeout") && !strings.Contains(errStr, "deadline") {
|
||||
t.Errorf("error should mention timeout/deadline (the client.Timeout path), got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -42,11 +41,7 @@ func NewMCPRateLimiter(rate int, interval time.Duration, ctx context.Context) *M
|
||||
interval: interval,
|
||||
}
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("mcp_ratelimit: PANIC in bucket cleanup: %v", r)
|
||||
}
|
||||
}()
|
||||
defer recoverPanic("mcp_ratelimit: bucket cleanup")
|
||||
ticker := time.NewTicker(5 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestRecoverPanic_RecoversFromPanicInGoroutine: the core#2125
|
||||
// family of fixes added `recover()` wrappers to 6 long-lived
|
||||
// background goroutines (cache sweepers, rate-limit cleanup, session
|
||||
// cache, A2A SSE idle watcher, terminal bridges, importer provision
|
||||
// goroutine). The wrapper was refactored into a single
|
||||
// `recoverPanic(prefix)` helper so the panic-recovery contract is
|
||||
// testable in one place. This test proves the contract:
|
||||
//
|
||||
// 1. A goroutine that defers `recoverPanic(...)` and then panics
|
||||
// does NOT crash the test process — the panic is contained.
|
||||
// 2. The recovered value is logged with the caller's prefix — so
|
||||
// operators can grep for the specific goroutine that tripped.
|
||||
// 3. The goroutine's deferred epilogue (e.g. `close(done)`) STILL
|
||||
// runs after the panic — the wrapper returns normally, so any
|
||||
// `defer close(done)` placed BEFORE the recoverPanic defer
|
||||
// fires (defers run in LIFO; the close runs after the recover).
|
||||
//
|
||||
// Runtime: sub-millisecond.
|
||||
func TestRecoverPanic_RecoversFromPanicInGoroutine(t *testing.T) {
|
||||
// Capture the log output so we can assert the prefix appears
|
||||
// and the recovered value is included.
|
||||
var buf bytes.Buffer
|
||||
prevWriter := log.Writer()
|
||||
prevFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(prevWriter)
|
||||
log.SetFlags(prevFlags)
|
||||
})
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
defer recoverPanic("test_goroutine")
|
||||
defer close(done)
|
||||
panic("simulated panic value")
|
||||
}()
|
||||
|
||||
// Bound the wait so a regressed recoverPanic (one that fails to
|
||||
// actually recover) surfaces as a test hang, not a CI timeout.
|
||||
select {
|
||||
case <-done:
|
||||
// Recovered and returned cleanly.
|
||||
case <-time.After(1 * time.Second):
|
||||
t.Fatal("goroutine did not recover within 1s — recoverPanic is broken (or the panic is propagating)")
|
||||
}
|
||||
|
||||
// Assert the log line mentions the prefix AND the recovered value.
|
||||
out := buf.String()
|
||||
if !strings.Contains(out, "test_goroutine") {
|
||||
t.Errorf("recoverPanic log should include the prefix; got: %q", out)
|
||||
}
|
||||
if !strings.Contains(out, "simulated panic value") {
|
||||
t.Errorf("recoverPanic log should include the recovered panic value; got: %q", out)
|
||||
}
|
||||
if !strings.Contains(out, "PANIC") {
|
||||
t.Errorf("recoverPanic log should include the 'PANIC' marker for grep-ability; got: %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRecoverPanic_NoopOnNormalReturn: a goroutine that defers
|
||||
// recoverPanic(...) and returns normally must NOT log anything —
|
||||
// `recover()` returns nil when there was no panic, and the wrapper
|
||||
// must not emit a spurious "PANIC" line in that case (which would
|
||||
// page operators for nothing and obscure the real signal).
|
||||
func TestRecoverPanic_NoopOnNormalReturn(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
prevWriter := log.Writer()
|
||||
prevFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(prevWriter)
|
||||
log.SetFlags(prevFlags)
|
||||
})
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
defer recoverPanic("normal_return_goroutine")
|
||||
defer close(done)
|
||||
// No panic — just return.
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(1 * time.Second):
|
||||
t.Fatal("normal-return goroutine did not finish in 1s")
|
||||
}
|
||||
|
||||
if out := buf.String(); strings.Contains(out, "PANIC") {
|
||||
t.Errorf("recoverPanic must NOT log on a normal return; got: %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
// (REMOVED) TestHTTPPanicRecovery_HandlerPanicReturns500:
|
||||
//
|
||||
// Originally added in commit e4b649c9 to address CR2's first
|
||||
// round of feedback ("exercise the real HTTP panic-recovery
|
||||
// path"). On the second round of review, CR2 correctly flagged
|
||||
// that the test wired a synthetic `gin.New() + gin.Recovery()`
|
||||
// engine rather than the production `router.Setup` flow (which
|
||||
// uses `gin.Default()`). Exercising the production router
|
||||
// requires constructing ~10 dependencies (Hub, Broadcaster,
|
||||
// Provisioner, handlers.WorkspaceHandler, etc.) — out of scope
|
||||
// for this regression-test PR.
|
||||
//
|
||||
// Per CR2's option 2 in the second review, the synthetic
|
||||
// handler-panic coverage has been REMOVED from the PR. The
|
||||
// PR's merge-blocking coverage is now narrowed to:
|
||||
// (a) the real http client.Timeout regression (10s slow-
|
||||
// upstream test against the production refreshEnvFromCP)
|
||||
// (b) the recoverPanic helper contract (covered for the 3
|
||||
// in-production call sites in internal/middleware: the
|
||||
// session_auth sweeper, ratelimit cleanup, mcp_ratelimit
|
||||
// cleanup)
|
||||
// Adding a real production-router handler-panic regression is
|
||||
// a candidate for a follow-up PR — the work is "make router.Setup
|
||||
// testable with the minimum dependency surface, then assert
|
||||
// 500+no-crash through the real path", which is a non-trivial
|
||||
// refactor on its own.
|
||||
@@ -3,7 +3,6 @@ package middleware
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -36,11 +35,7 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate
|
||||
interval: interval,
|
||||
}
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("ratelimit: PANIC in bucket cleanup: %v", r)
|
||||
}
|
||||
}()
|
||||
defer recoverPanic("ratelimit: bucket cleanup")
|
||||
ticker := time.NewTicker(5 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
|
||||
@@ -116,11 +116,7 @@ func sessionCachePut(key string, ok bool) {
|
||||
|
||||
func init() {
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("session_auth: PANIC in cache sweeper: %v", r)
|
||||
}
|
||||
}()
|
||||
defer recoverPanic("session_auth: cache sweeper")
|
||||
// Jitter startup so restarts don't align sweeps.
|
||||
time.Sleep(time.Duration(rand.Int64N(int64(sessionCacheSweepEvery))))
|
||||
t := time.NewTicker(sessionCacheSweepEvery)
|
||||
@@ -131,6 +127,30 @@ func init() {
|
||||
}()
|
||||
}
|
||||
|
||||
// recoverPanic is the canonical panic-recovery wrapper for the
|
||||
// long-lived background goroutines the workspace-server middleware
|
||||
// package runs (cache sweepers, rate-limit cleanup, session cache,
|
||||
// A2A SSE idle watcher, terminal bridges, importer provision
|
||||
// goroutine — see core#2125). It is invoked via `defer` at the
|
||||
// TOP of any goroutine that must not let a panic crash the process
|
||||
// (the package has no global recover; a panic that escapes a
|
||||
// goroutine takes the whole workspace-server down).
|
||||
//
|
||||
// The wrapper logs the recovered value with a caller-supplied
|
||||
// prefix so the operator can grep for the specific goroutine that
|
||||
// tripped, then returns. The deferred caller continues to its
|
||||
// function epilogue (typically `close(done)`).
|
||||
//
|
||||
// Refactored from inline `defer func() { if r := recover(); ... }()`
|
||||
// blocks (which were duplicated across 6 call sites in #2125) so
|
||||
// the panic-recovery contract is testable in one place — see
|
||||
// TestRecoverPanic_RecoversFromPanicInGoroutine in panic_recovery_test.go.
|
||||
func recoverPanic(prefix string) {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("%s: PANIC: %v", prefix, r)
|
||||
}
|
||||
}
|
||||
|
||||
// sweepExpired removes expired entries so a low-hit-rate cache still
|
||||
// releases memory. Cheap — we hold the lock briefly per entry.
|
||||
func sweepExpired() {
|
||||
|
||||
Reference in New Issue
Block a user