molecule-core/workspace-server/internal/metrics/metrics_test.go
Hongming Wang c778b62202 feat(metrics): add molecule_phantom_busy_resets_total counter (#2865)
Closes #2865 (split-B of the #2669 root-cause stack).

The phantom-busy sweep in workspace-server/internal/scheduler/scheduler.go
already logs each row reset, but no aggregate metric surfaces "how often
is this firing." A regression that causes high reset rates (e.g.
controlplane#481's missing env vars, or future drift in the workspace
runtime's task-lifecycle accounting) only surfaces when users complain.

Fix: counter exposed at /metrics as molecule_phantom_busy_resets_total,
incremented from sweepPhantomBusy after each row whose active_tasks
was reset. Same shape as existing molecule_websocket_connections_active.

Operator-side dashboard: alert when daily phantom-busy reset count
> 0.5% of active workspaces. Today's steady-state is near-zero; any
increase is a regression signal.

Tests:
  - TestTrackPhantomBusyReset_IncrementsCounter
  - TestTrackPhantomBusyReset_RaceFreeUnderConcurrentWrites (50×200
    concurrent writes; tests atomic invariant)
  - TestHandler_ExposesPhantomBusyResetsCounter (asserts HELP + TYPE
    + value lines in Prometheus text format)
  - TestHandler_PhantomBusyResetsZeroByDefault (fresh-process 0
    contract — prevents a future refactor from accidentally dropping
    the metric from /metrics)

Race-detector clean. Vet clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 04:45:24 -07:00

105 lines
3.0 KiB
Go

package metrics
// Tests for the phantom-busy reset counter wired up by issue #2865.
// The counter is exposed at /metrics as
// molecule_phantom_busy_resets_total. Because it is a cumulative
// counter, the signal is its rate of increase: a sustained high reset
// rate points to task-lifecycle accounting regressions in the agent
// loop — see scheduler.sweepPhantomBusy for the writer.
import (
"net/http/httptest"
"strings"
"sync"
"sync/atomic"
"testing"
"github.com/gin-gonic/gin"
)
// resetForTest puts the package-level phantomBusyResets counter back to
// zero. The counter is shared package state, so each test that reads or
// bumps it calls this first to start from a known baseline instead of
// inheriting increments left behind by an earlier test.
func resetForTest() {
	const baseline int64 = 0
	atomic.StoreInt64(&phantomBusyResets, baseline)
}
// TestTrackPhantomBusyReset_IncrementsCounter checks that sequential calls
// to TrackPhantomBusyReset each add exactly one to phantomBusyResets:
// starting from a zeroed counter, seven calls must leave it at seven.
func TestTrackPhantomBusyReset_IncrementsCounter(t *testing.T) {
	resetForTest()

	const calls = 7
	for remaining := calls; remaining > 0; remaining-- {
		TrackPhantomBusyReset()
	}

	if got := atomic.LoadInt64(&phantomBusyResets); got != calls {
		t.Errorf("counter after 7 calls = %d, want 7", got)
	}
}
// TestTrackPhantomBusyReset_RaceFreeUnderConcurrentWrites calls
// TrackPhantomBusyReset from 50 goroutines, 200 times each, and requires
// the final counter to equal the total number of calls. Any shortfall
// means increments were lost between concurrent writers.
func TestTrackPhantomBusyReset_RaceFreeUnderConcurrentWrites(t *testing.T) {
	resetForTest()

	const (
		goroutines        = 50
		callsPerGoroutine = 200
	)

	// hammer is the workload of a single writer goroutine.
	hammer := func(done *sync.WaitGroup) {
		defer done.Done()
		for remaining := callsPerGoroutine; remaining > 0; remaining-- {
			TrackPhantomBusyReset()
		}
	}

	var wg sync.WaitGroup
	for started := 0; started < goroutines; started++ {
		wg.Add(1)
		go hammer(&wg)
	}
	wg.Wait()

	want := int64(goroutines * callsPerGoroutine)
	if got := atomic.LoadInt64(&phantomBusyResets); got != want {
		t.Errorf("counter under concurrent writes = %d, want %d (lost increments → atomic broken)",
			got, want)
	}
}
// TestHandler_ExposesPhantomBusyResetsCounter scrapes /metrics through a gin
// router after three TrackPhantomBusyReset calls and checks that the response
// carries the HELP line, the TYPE line, and the sample line with value 3 for
// molecule_phantom_busy_resets_total, and that HELP and TYPE appear before
// the sample line.
func TestHandler_ExposesPhantomBusyResetsCounter(t *testing.T) {
	const (
		helpPrefix = "# HELP molecule_phantom_busy_resets_total"
		typeLine   = "# TYPE molecule_phantom_busy_resets_total counter"
		sampleLine = "molecule_phantom_busy_resets_total 3\n"
	)

	resetForTest()
	for i := 0; i < 3; i++ {
		TrackPhantomBusyReset()
	}

	gin.SetMode(gin.TestMode)
	r := gin.New()
	r.GET("/metrics", Handler())
	w := httptest.NewRecorder()
	req := httptest.NewRequest("GET", "/metrics", nil)
	r.ServeHTTP(w, req)
	body := w.Body.String()

	// Record positions rather than mere presence so the ordering can be
	// asserted below; strings.Index returns -1 when the text is absent.
	helpAt := strings.Index(body, helpPrefix)
	typeAt := strings.Index(body, typeLine)
	sampleAt := strings.Index(body, sampleLine)

	if helpAt < 0 {
		t.Errorf("metrics output missing HELP line for molecule_phantom_busy_resets_total:\n%s", body)
	}
	if typeAt < 0 {
		t.Errorf("metrics output missing TYPE line for molecule_phantom_busy_resets_total:\n%s", body)
	}
	if sampleAt < 0 {
		t.Errorf("metrics output missing counter value 3:\n%s", body)
	}
	if helpAt < 0 || typeAt < 0 || sampleAt < 0 {
		// Ordering is meaningless when any of the three lines is missing.
		return
	}

	// HELP + TYPE lines must precede the metric (Prometheus text exposition
	// format). A presence-only check would let a misordered handler pass.
	if helpAt > sampleAt {
		t.Errorf("HELP line for molecule_phantom_busy_resets_total appears after its sample line:\n%s", body)
	}
	if typeAt > sampleAt {
		t.Errorf("TYPE line for molecule_phantom_busy_resets_total appears after its sample line:\n%s", body)
	}
}
// TestHandler_PhantomBusyResetsZeroByDefault pins that /metrics still lists
// molecule_phantom_busy_resets_total, with value 0, when no reset has been
// tracked since the counter was zeroed. It guards against a future refactor
// (for example lazy initialisation of the counter) silently dropping the
// metric from the output while its value is still zero.
func TestHandler_PhantomBusyResetsZeroByDefault(t *testing.T) {
	resetForTest()

	gin.SetMode(gin.TestMode)
	router := gin.New()
	router.GET("/metrics", Handler())

	rec := httptest.NewRecorder()
	router.ServeHTTP(rec, httptest.NewRequest("GET", "/metrics", nil))

	const zeroSample = "molecule_phantom_busy_resets_total 0\n"
	if out := rec.Body.String(); !strings.Contains(out, zeroSample) {
		t.Errorf("metric must report 0 by default:\n%s", out)
	}
}