molecule-core/workspace-server/internal/registry/healthsweep_test.go
Hongming Wang fdf1b5d76a refactor(workspace-status): typed constants + AST-based drift gate
Eliminate raw 'awaiting_agent'/'hibernating'/'failed'/etc string literals
from production status writes. Adds models.WorkspaceStatus typed alias and
models.AllWorkspaceStatuses canonical slice; every UPDATE workspaces SET
status = ... now passes a parameterized $N typed value rather than a
hard-coded SQL literal.

Defense-in-depth follow-up to migration 046 (#2388): the Postgres enum
type was missing 'awaiting_agent' + 'hibernating' for ~5 days because
sqlmock regex matching cannot enforce live enum constraints. The drift
gate is now a proper Go AST + SQL parser (no regex), asserting the
codebase ⊆ migration enum and every const appears in the canonical
slice. With status as a parameterized typed value, future enum mismatches
fail at the SQL layer in tests, not silently in prod.

Test coverage: full suite passes with -race; drift gate green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 10:41:41 -07:00

303 lines
9.3 KiB
Go

package registry
import (
"context"
"sync"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
)
// mockChecker is a ContainerChecker test double that answers IsRunning from
// a fixed lookup table.
type mockChecker struct {
	// mu serializes access to running.
	mu sync.Mutex
	// running maps a workspace ID to the answer IsRunning gives for it;
	// IDs absent from the map read as not running.
	running map[string]bool
}

// IsRunning reports the configured state for workspaceID. The context is
// ignored and the returned error is always nil.
func (m *mockChecker) IsRunning(_ context.Context, workspaceID string) (bool, error) {
	m.mu.Lock()
	alive := m.running[workspaceID]
	m.mu.Unlock()
	return alive, nil
}
// setupTestDB points the package-level db.DB at a fresh sqlmock connection
// and returns the mock on which the caller scripts expectations. The
// connection is closed when the test finishes; db.DB itself is not restored.
func setupTestDB(t *testing.T) sqlmock.Sqlmock {
	t.Helper()

	conn, mock, err := sqlmock.New()
	if err != nil {
		t.Fatalf("failed to create sqlmock: %v", err)
	}
	t.Cleanup(func() {
		// The Close result is deliberately discarded; no test inspects it.
		_ = conn.Close()
	})

	db.DB = conn
	return mock
}
// setupTestRedis starts an in-process miniredis server, points the
// package-level db.RDB at a client connected to it, and returns the server
// so tests can seed and inspect keys directly.
//
// Both the client and the server are shut down when the test finishes, so
// the client's connection pool does not leak from one test to the next.
// db.RDB is not restored afterwards.
func setupTestRedis(t *testing.T) *miniredis.Miniredis {
	t.Helper()

	mr, err := miniredis.Run()
	if err != nil {
		t.Fatalf("failed to start miniredis: %v", err)
	}

	client := redis.NewClient(&redis.Options{Addr: mr.Addr()})
	db.RDB = client

	t.Cleanup(func() {
		// Close the client first so its pooled connections are released
		// before the server they point at goes away. The error is
		// discarded: there is nothing useful to do with it at teardown.
		_ = client.Close()
		mr.Close()
	})
	return mr
}
// TestSweepOnlineWorkspaces_DeadContainer covers the case where the checker
// reports a workspace's container as not running: the sweep must update the
// row to models.StatusOffline, invoke onOffline with that workspace ID, and
// remove the workspace's liveness and URL keys from Redis.
func TestSweepOnlineWorkspaces_DeadContainer(t *testing.T) {
	mock := setupTestDB(t)
	mr := setupTestRedis(t)

	// Seed the liveness key and both URL caches so their removal is observable.
	mr.Set("ws:ws-dead-123", "online")
	mr.Set("ws:ws-dead-123:url", "http://127.0.0.1:32000")
	mr.Set("ws:ws-dead-123:internal_url", "http://ws-ws-dead-123:8000")

	// The sweep's SELECT yields exactly one workspace.
	mock.ExpectQuery("SELECT id FROM workspaces WHERE status IN").
		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-dead-123"))

	// The Docker sweep writes 'offline'; 'awaiting_agent' belongs to the
	// external-runtime path.
	mock.ExpectExec("UPDATE workspaces SET status =").
		WithArgs(models.StatusOffline, "ws-dead-123").
		WillReturnResult(sqlmock.NewResult(0, 1))

	// The checker reports the container as dead.
	checker := &mockChecker{running: map[string]bool{"ws-dead-123": false}}

	var (
		mu       sync.Mutex
		notified []string
	)
	record := func(_ context.Context, id string) {
		mu.Lock()
		defer mu.Unlock()
		notified = append(notified, id)
	}

	sweepOnlineWorkspaces(context.Background(), checker, record)

	if err := mock.ExpectationsWereMet(); err != nil {
		t.Fatalf("unmet SQL expectations: %v", err)
	}

	// Snapshot under the lock, then assert on the copy.
	mu.Lock()
	got := append([]string(nil), notified...)
	mu.Unlock()
	if len(got) != 1 || got[0] != "ws-dead-123" {
		t.Fatalf("expected onOffline for ws-dead-123, got: %v", got)
	}

	// Every Redis key belonging to the workspace must be gone.
	for _, c := range []struct{ key, msg string }{
		{"ws:ws-dead-123", "expected liveness key to be deleted"},
		{"ws:ws-dead-123:url", "expected URL cache to be deleted"},
		{"ws:ws-dead-123:internal_url", "expected internal URL cache to be deleted"},
	} {
		if mr.Exists(c.key) {
			t.Error(c.msg)
		}
	}
}
// TestSweepOnlineWorkspaces_RunningContainer covers the healthy case: when
// the checker reports the container as running, the sweep must neither
// call onOffline nor issue any statement beyond the initial SELECT.
func TestSweepOnlineWorkspaces_RunningContainer(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	// Only the SELECT is scripted; a running container must not produce
	// an UPDATE.
	mock.ExpectQuery("SELECT id FROM workspaces WHERE status IN").
		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-alive-456"))

	checker := &mockChecker{running: map[string]bool{"ws-alive-456": true}}

	var fired bool
	onOffline := func(context.Context, string) { fired = true }
	sweepOnlineWorkspaces(context.Background(), checker, onOffline)

	if fired {
		t.Error("onOffline should not be called for running container")
	}
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Fatalf("unmet SQL expectations: %v", err)
	}
}
// TestStartHealthSweep_NilChecker verifies that StartHealthSweep accepts a
// nil checker and a nil onOffline callback without panicking, and that it
// returns once its context is done.
//
// The context expires after 100ms while the sweep interval is 1s, so the
// call is expected back well inside the 2s watchdog below.
//
// NOTE(review): the previous wording said the call "should return
// immediately". Per the Phase 30.7 notes in this file a nil checker no
// longer short-circuits StartHealthSweep, so the return observed here is
// presumably driven by the context timeout — confirm against the
// implementation.
func TestStartHealthSweep_NilChecker(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	done := make(chan struct{})
	go func() {
		defer close(done)
		StartHealthSweep(ctx, nil, time.Second, nil)
	}()

	select {
	case <-done:
		// Returned without panicking before the watchdog fired.
	case <-time.After(2 * time.Second):
		t.Fatal("StartHealthSweep with nil checker did not return after its context expired")
	}
}
// ==================== Phase 30.7 — sweepStaleRemoteWorkspaces ====================
// The remote-liveness sweep queries workspaces with runtime='external'
// whose last_heartbeat_at is older than the stale-after window, marks
// them models.StatusAwaitingAgent (not offline — that status is used by
// the Docker sweep), clears Redis state, and fires onOffline. These tests
// verify the SQL shape, the stale-path side effects, and the
// environment-variable override for the staleness window.
// TestSweepStaleRemoteWorkspaces_MarksStaleAwaitingAgent scripts a stale
// query that returns two external workspaces and checks that the sweep
// updates each one to models.StatusAwaitingAgent and reports each one to
// onOffline exactly once.
func TestSweepStaleRemoteWorkspaces_MarksStaleAwaitingAgent(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	// Two stale remote workspaces returned by the query.
	mock.ExpectQuery(`FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND COALESCE\(runtime, 'langgraph'\) = 'external'\s+AND COALESCE\(last_heartbeat_at, updated_at\) < now\(\) - `).
		WillReturnRows(sqlmock.NewRows([]string{"id"}).
			AddRow("ws-stale-1").
			AddRow("ws-stale-2"))
	mock.ExpectExec(`UPDATE workspaces SET status =`).
		WithArgs(models.StatusAwaitingAgent, "ws-stale-1").
		WillReturnResult(sqlmock.NewResult(0, 1))
	mock.ExpectExec(`UPDATE workspaces SET status =`).
		WithArgs(models.StatusAwaitingAgent, "ws-stale-2").
		WillReturnResult(sqlmock.NewResult(0, 1))

	var offlineCalls []string
	onOffline := func(_ context.Context, id string) {
		offlineCalls = append(offlineCalls, id)
	}

	sweepStaleRemoteWorkspaces(context.Background(), onOffline)

	if len(offlineCalls) != 2 {
		t.Errorf("expected onOffline called twice, got %d (%v)", len(offlineCalls), offlineCalls)
	}

	// The count alone would accept the same ID reported twice, so also
	// require that each stale workspace was reported. Order is not asserted.
	notified := make(map[string]bool, len(offlineCalls))
	for _, id := range offlineCalls {
		notified[id] = true
	}
	for _, want := range []string{"ws-stale-1", "ws-stale-2"} {
		if !notified[want] {
			t.Errorf("expected onOffline for %s, got %v", want, offlineCalls)
		}
	}

	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
// TestSweepStaleRemoteWorkspaces_NoStaleWorkspaces checks that when the
// stale query returns no rows, the sweep still issues that query but never
// invokes onOffline.
func TestSweepStaleRemoteWorkspaces_NoStaleWorkspaces(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	mock.ExpectQuery(`FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND COALESCE\(runtime, 'langgraph'\) = 'external'`).
		WillReturnRows(sqlmock.NewRows([]string{"id"}))

	called := 0
	onOffline := func(_ context.Context, _ string) { called++ }

	sweepStaleRemoteWorkspaces(context.Background(), onOffline)

	if called != 0 {
		t.Errorf("onOffline should not fire when no stale rows; got %d", called)
	}
	// Without this check the test would also pass if the sweep never
	// queried the database at all.
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
// TestSweepStaleRemoteWorkspaces_NilCallbackNoPanic checks that a nil
// onOffline callback is tolerated: the sweep must not panic and must still
// run the scripted SELECT and the UPDATE to models.StatusAwaitingAgent.
func TestSweepStaleRemoteWorkspaces_NilCallbackNoPanic(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	mock.ExpectQuery(`FROM workspaces`).
		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-x"))
	mock.ExpectExec(`UPDATE workspaces SET status =`).
		WithArgs(models.StatusAwaitingAgent, "ws-x").
		WillReturnResult(sqlmock.NewResult(0, 1))

	// Must not panic with nil callback.
	sweepStaleRemoteWorkspaces(context.Background(), nil)

	// Confirm the stale row was actually processed, so an early return on
	// a nil callback cannot pass as "no panic".
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
// TestSweepStaleRemoteWorkspaces_QueryErrorLogged checks the failure path:
// when the stale query returns an error, the sweep must return without
// panicking and without invoking onOffline.
func TestSweepStaleRemoteWorkspaces_QueryErrorLogged(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	mock.ExpectQuery(`FROM workspaces`).
		WillReturnError(assertDBDown{})

	// Must return cleanly without panicking. No onOffline should fire.
	called := 0
	sweepStaleRemoteWorkspaces(context.Background(), func(_ context.Context, _ string) { called++ })

	if called != 0 {
		t.Errorf("on query error, no onOffline should fire; got %d", called)
	}
	// Prove the failing query was actually issued; otherwise the error
	// path was never exercised.
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
// assertDBDown is a stateless error value the tests hand to sqlmock to
// simulate a failing database query.
type assertDBDown struct{}

// Error implements the error interface with a fixed message.
func (assertDBDown) Error() string {
	return "simulated DB outage"
}
// ==================== Phase 30.7 — remoteStaleAfter env override ====================
// TestRemoteStaleAfter_DefaultWhenUnset checks that remoteStaleAfter returns
// DefaultRemoteStaleAfter when REMOTE_LIVENESS_STALE_AFTER holds no value.
// The variable is set to the empty string here rather than removed from the
// environment.
func TestRemoteStaleAfter_DefaultWhenUnset(t *testing.T) {
	t.Setenv("REMOTE_LIVENESS_STALE_AFTER", "")

	got := remoteStaleAfter()
	if got == DefaultRemoteStaleAfter {
		return
	}
	t.Errorf("expected default %s, got %s", DefaultRemoteStaleAfter, got)
}
// TestRemoteStaleAfter_HonorsValidOverride checks that setting
// REMOTE_LIVENESS_STALE_AFTER to "45" makes remoteStaleAfter return 45
// seconds.
func TestRemoteStaleAfter_HonorsValidOverride(t *testing.T) {
	t.Setenv("REMOTE_LIVENESS_STALE_AFTER", "45")

	const want = 45 * time.Second
	got := remoteStaleAfter()
	if got == want {
		return
	}
	t.Errorf("expected 45s, got %s", got)
}
// TestRemoteStaleAfter_FallsBackOnGarbage checks that remoteStaleAfter
// returns DefaultRemoteStaleAfter for each rejected
// REMOTE_LIVENESS_STALE_AFTER value: non-numeric text, zero, a negative
// number, and the empty string.
func TestRemoteStaleAfter_FallsBackOnGarbage(t *testing.T) {
	rejected := []string{"abc", "0", "-10", ""}

	for i := range rejected {
		raw := rejected[i]
		t.Setenv("REMOTE_LIVENESS_STALE_AFTER", raw)

		got := remoteStaleAfter()
		if got == DefaultRemoteStaleAfter {
			continue
		}
		t.Errorf("value %q: expected fallback to default, got %s", raw, got)
	}
}
// ==================== Phase 30.7 — StartHealthSweep with nil Docker checker ====================
// TestStartHealthSweep_NilCheckerRunsRemoteSweep covers the Phase 30.7
// behavior change. Before 30.7, a nil checker caused StartHealthSweep to
// return immediately (no liveness monitoring at all). Now it should still
// run the remote sweep on the ticker. The test lets the ticker fire,
// cancels the context, waits for StartHealthSweep to return, and then
// asserts that the remote-sweep query reached the mocked DB at least once.
func TestStartHealthSweep_NilCheckerRunsRemoteSweep(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	// Only the first remote-sweep query is scripted. sqlmock consumes an
	// expectation once, so a query from any later tick gets an error back
	// from the mock; the sweep must tolerate that without panicking.
	mock.ExpectQuery(`FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND COALESCE\(runtime, 'langgraph'\) = 'external'`).
		WillReturnRows(sqlmock.NewRows([]string{"id"}))

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	done := make(chan struct{})
	go func() {
		defer close(done)
		StartHealthSweep(ctx, nil, 50*time.Millisecond, nil)
	}()

	// Leave enough time for the 50ms ticker to fire before cancelling.
	time.Sleep(120 * time.Millisecond)
	cancel()

	select {
	case <-done:
	case <-time.After(500 * time.Millisecond):
		t.Fatal("StartHealthSweep did not return after ctx cancel")
	}

	// The sweep goroutine has exited, so the mock can be inspected without
	// racing it. An unmet expectation means no remote-sweep query was
	// issued while the sweep was running.
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("remote sweep query never reached the DB: %v", err)
	}
}