Two follow-ups from PR #2494's review: 1. Two new sweep tests exercise the lookup path through sweepStuckProvisioning end-to-end: - ManifestOverrideSparesRow: claude-code 11min old, manifest=20min → no UPDATE, no broadcast (sparing works through the sweeper) - ManifestOverrideStillFlipsPastDeadline: claude-code 21min old, manifest=20min → flipped + payload.timeout_secs=1200 Closes the gap that the unit-test on provisioningTimeoutFor alone left open: a future refactor could drop the lookup arg from the sweeper's call and the unit test alone would not catch it. Verified by regression-injecting `lookup→nil` in sweepStuckProvisioning — both new tests fail, the old ones still pass. 2. addProvisionTimeoutMs now goes through ProvisionTimeoutSecondsForRuntime instead of calling provisionTimeouts.get directly. Single accessor path for the same data — the canvas response and the sweeper now resolve identically by construction. No production behavior change; tests + accessor cleanup only. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
413 lines
15 KiB
Go
413 lines
15 KiB
Go
package registry
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/DATA-DOG/go-sqlmock"
|
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
|
)
|
|
|
|
// emittedEvent is a snapshot of the arguments of one RecordAndBroadcast
// call.
type emittedEvent struct {
	Type        string
	WorkspaceID string
	Payload     interface{}
}

// fakeEmitter is a test double that captures every RecordAndBroadcast call
// so tests can assert on the number of emissions and on payload shape. When
// fail is set, each call is still captured but reports an error. The methods
// below take mu around every access to events, so the emitter can be shared
// if the suite ever fans out.
type fakeEmitter struct {
	mu     sync.Mutex
	events []emittedEvent
	fail   bool
}

// RecordAndBroadcast appends the call to f.events, then returns the
// "broadcast boom" error if f.fail is set and nil otherwise. The context
// argument is ignored.
func (f *fakeEmitter) RecordAndBroadcast(_ context.Context, eventType string, workspaceID string, payload interface{}) error {
	f.mu.Lock()
	defer f.mu.Unlock()

	recorded := emittedEvent{
		Type:        eventType,
		WorkspaceID: workspaceID,
		Payload:     payload,
	}
	f.events = append(f.events, recorded)

	if !f.fail {
		return nil
	}
	return errors.New("broadcast boom")
}

// count reports how many calls have been captured so far.
func (f *fakeEmitter) count() int {
	f.mu.Lock()
	n := len(f.events)
	f.mu.Unlock()
	return n
}
|
|
|
|
// candidateRows builds the new-shape query result (id, runtime, age_sec).
|
|
// Use this in every sweep test to match the runtime-aware SELECT.
|
|
func candidateRows(rows ...[3]any) *sqlmock.Rows {
|
|
r := sqlmock.NewRows([]string{"id", "runtime", "age_sec"})
|
|
for _, row := range rows {
|
|
r = r.AddRow(row[0], row[1], row[2])
|
|
}
|
|
return r
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
|
|
// provisioning workspace gets flipped to failed AND an event is broadcast.
|
|
func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
// claude-code workspace, 700s old > 600s default timeout → flipped.
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))
|
|
|
|
mock.ExpectExec(`UPDATE workspaces`).
|
|
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
|
WillReturnResult(sqlmock.NewResult(0, 1))
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, nil)
|
|
|
|
if emit.count() != 1 {
|
|
t.Fatalf("expected 1 event, got %d", emit.count())
|
|
}
|
|
if emit.events[0].Type != "WORKSPACE_PROVISION_FAILED" {
|
|
t.Errorf("event type = %q, want WORKSPACE_PROVISION_FAILED", emit.events[0].Type)
|
|
}
|
|
if emit.events[0].WorkspaceID != "ws-stuck" {
|
|
t.Errorf("workspace id = %q, want ws-stuck", emit.events[0].WorkspaceID)
|
|
}
|
|
if err := mock.ExpectationsWereMet(); err != nil {
|
|
t.Errorf("unmet expectations: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_HermesGets30MinSlack — the regression that
|
|
// motivated the runtime-aware change. A hermes workspace 11 min into
|
|
// cold-boot must NOT be flipped to failed; the watcher's 25-min budget
|
|
// covers it. Without the fix, the 10-min sweep killed healthy hermes
|
|
// boots mid-install (issue #2061's E2E failure on 2026-04-26).
|
|
func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
// 11 min = 660 sec. < HermesProvisioningTimeout (1800s).
|
|
// No UPDATE should fire — hermes still has time.
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, nil)
|
|
|
|
if emit.count() != 0 {
|
|
t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
|
|
}
|
|
if err := mock.ExpectationsWereMet(); err != nil {
|
|
t.Errorf("unmet expectations: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_HermesPastDeadline — a hermes workspace
|
|
// past 30 min DOES get flipped. Closes the loop on the runtime-aware
|
|
// fix: it's still bounded, just with a longer threshold than other
|
|
// runtimes.
|
|
func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
// 31 min = 1860 sec > HermesProvisioningTimeout (1800s).
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows([3]any{"ws-hermes-stuck", "hermes", 1860}))
|
|
mock.ExpectExec(`UPDATE workspaces`).
|
|
WithArgs("ws-hermes-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
|
WillReturnResult(sqlmock.NewResult(0, 1))
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, nil)
|
|
|
|
if emit.count() != 1 {
|
|
t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
|
|
}
|
|
// Payload should include runtime so ops can distinguish in logs.
|
|
payload, ok := emit.events[0].Payload.(map[string]interface{})
|
|
if !ok {
|
|
t.Fatalf("payload not a map: %T", emit.events[0].Payload)
|
|
}
|
|
if payload["runtime"] != "hermes" {
|
|
t.Errorf("payload.runtime = %v, want hermes", payload["runtime"])
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_ManifestOverrideSparesRow pins the
|
|
// integration of the sweeper + RuntimeTimeoutLookup contract introduced
|
|
// in #2494. Closes the gap that the unit-test on provisioningTimeoutFor
|
|
// alone left open: a future refactor could drop the lookup arg from
|
|
// sweepStuckProvisioning's call to provisioningTimeoutFor and only the
|
|
// unit test would catch it. This test fails on that refactor too.
|
|
//
|
|
// Scenario: a claude-code workspace 11 min old (660s). Default budget
|
|
// is 10 min (600s) → without manifest override, this would be flipped
|
|
// to failed. Manifest override declares 1200s → it should be SPARED.
|
|
// No UPDATE, no event emitted.
|
|
func TestSweepStuckProvisioning_ManifestOverrideSparesRow(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows([3]any{"ws-claude-templated", "claude-code", 660}))
|
|
|
|
// No ExpectExec — if the sweeper still flips the row, sqlmock will
|
|
// fail with an unexpected-query error.
|
|
|
|
lookup := func(runtime string) int {
|
|
if runtime == "claude-code" {
|
|
return 1200 // manifest override: 20 min
|
|
}
|
|
return 0
|
|
}
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, lookup)
|
|
|
|
if emit.count() != 0 {
|
|
t.Errorf("manifest-overridden row should NOT have been flipped, got %d events", emit.count())
|
|
}
|
|
if err := mock.ExpectationsWereMet(); err != nil {
|
|
t.Errorf("unmet expectations: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_ManifestOverrideStillFlipsPastDeadline —
|
|
// the symmetric case. Manifest override gives a longer window but a
|
|
// row past THAT longer window must still be flipped. Otherwise a
|
|
// template that declares an absurd timeout could leave rows wedged
|
|
// forever.
|
|
func TestSweepStuckProvisioning_ManifestOverrideStillFlipsPastDeadline(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
// 21 min = 1260s > 1200s manifest override → flipped.
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows([3]any{"ws-claude-truly-stuck", "claude-code", 1260}))
|
|
mock.ExpectExec(`UPDATE workspaces`).
|
|
WithArgs("ws-claude-truly-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
|
WillReturnResult(sqlmock.NewResult(0, 1))
|
|
|
|
lookup := func(runtime string) int {
|
|
if runtime == "claude-code" {
|
|
return 1200
|
|
}
|
|
return 0
|
|
}
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, lookup)
|
|
|
|
if emit.count() != 1 {
|
|
t.Fatalf("row past manifest deadline must still be flipped, got %d events", emit.count())
|
|
}
|
|
payload, ok := emit.events[0].Payload.(map[string]interface{})
|
|
if !ok {
|
|
t.Fatalf("payload not a map: %T", emit.events[0].Payload)
|
|
}
|
|
if payload["timeout_secs"] != 1200 {
|
|
t.Errorf("payload.timeout_secs = %v, want 1200 (manifest override applied to event payload)", payload["timeout_secs"])
|
|
}
|
|
if err := mock.ExpectationsWereMet(); err != nil {
|
|
t.Errorf("unmet expectations: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
|
|
// 0 rows because the workspace flipped to online (or got restarted) between
|
|
// the SELECT and the UPDATE. We should skip the event, not emit a false
|
|
// WORKSPACE_PROVISION_FAILED.
|
|
func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows([3]any{"ws-raced", "claude-code", 700}))
|
|
|
|
mock.ExpectExec(`UPDATE workspaces`).
|
|
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
|
WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows — raced
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, nil)
|
|
|
|
if emit.count() != 0 {
|
|
t.Errorf("expected 0 events on race, got %d", emit.count())
|
|
}
|
|
if err := mock.ExpectationsWereMet(); err != nil {
|
|
t.Errorf("unmet expectations: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_NoStuck verifies that an empty candidate list
|
|
// produces no events and no update queries.
|
|
func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows())
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, nil)
|
|
|
|
if emit.count() != 0 {
|
|
t.Errorf("expected 0 events when nothing stuck, got %d", emit.count())
|
|
}
|
|
if err := mock.ExpectationsWereMet(); err != nil {
|
|
t.Errorf("unmet expectations: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
|
|
// both agents (claude-code + hermes) are stuck — both should get flipped
|
|
// and both should get events. claude-code at 11 min (over its 10-min
|
|
// limit), hermes at 31 min (over its 30-min limit).
|
|
func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows(
|
|
[3]any{"ws-claude-code", "claude-code", 700},
|
|
[3]any{"ws-hermes", "hermes", 1860},
|
|
))
|
|
|
|
mock.ExpectExec(`UPDATE workspaces`).
|
|
WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
|
WillReturnResult(sqlmock.NewResult(0, 1))
|
|
mock.ExpectExec(`UPDATE workspaces`).
|
|
WithArgs("ws-hermes", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
|
WillReturnResult(sqlmock.NewResult(0, 1))
|
|
|
|
emit := &fakeEmitter{}
|
|
sweepStuckProvisioning(context.Background(), emit, nil)
|
|
|
|
if emit.count() != 2 {
|
|
t.Fatalf("expected 2 events, got %d", emit.count())
|
|
}
|
|
}
|
|
|
|
// TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash ensures the
|
|
// sweeper tolerates a broadcast error (Redis hiccup) — the DB row is
|
|
// already flipped so the state stays coherent.
|
|
func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
|
|
mock := setupTestDB(t)
|
|
|
|
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
|
WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))
|
|
mock.ExpectExec(`UPDATE workspaces`).
|
|
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
|
|
WillReturnResult(sqlmock.NewResult(0, 1))
|
|
|
|
emit := &fakeEmitter{fail: true}
|
|
// Must not panic.
|
|
sweepStuckProvisioning(context.Background(), emit, nil)
|
|
}
|
|
|
|
// TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
|
|
// env var takes effect when set to a positive integer, and falls back to
|
|
// the per-runtime default otherwise.
|
|
func TestProvisioningTimeout_EnvOverride(t *testing.T) {
|
|
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
|
|
// When env override is set it wins over runtime defaults.
|
|
if got := provisioningTimeoutFor("", nil); got.Seconds() != 60 {
|
|
t.Errorf("override (no runtime): got %v, want 60s", got)
|
|
}
|
|
if got := provisioningTimeoutFor("hermes", nil); got.Seconds() != 60 {
|
|
t.Errorf("override (hermes): got %v, want 60s", got)
|
|
}
|
|
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
|
|
if got := provisioningTimeoutFor("", nil); got != DefaultProvisioningTimeout {
|
|
t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
|
|
}
|
|
t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
|
|
if got := provisioningTimeoutFor("claude-code", nil); got != DefaultProvisioningTimeout {
|
|
t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
|
|
}
|
|
}
|
|
|
|
// TestProvisioningTimeout_RuntimeAware verifies hermes gets the longer
|
|
// HermesProvisioningTimeout while other runtimes keep the default.
|
|
// Mirrors bootstrap_watcher.go's bootstrapTimeoutFn — these two
|
|
// timeouts must stay in sync (sweep > watcher) or healthy hermes
|
|
// boots get killed mid-install.
|
|
func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
|
|
cases := []struct {
|
|
runtime string
|
|
want time.Duration
|
|
}{
|
|
{"hermes", HermesProvisioningTimeout},
|
|
{"langgraph", DefaultProvisioningTimeout},
|
|
{"claude-code", DefaultProvisioningTimeout},
|
|
{"crewai", DefaultProvisioningTimeout},
|
|
{"autogen", DefaultProvisioningTimeout},
|
|
{"", DefaultProvisioningTimeout},
|
|
{"unknown-runtime", DefaultProvisioningTimeout},
|
|
}
|
|
for _, c := range cases {
|
|
if got := provisioningTimeoutFor(c.runtime, nil); got != c.want {
|
|
t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestProvisioningTimeout_ManifestOverride pins the resolution order
|
|
// when a template's config.yaml declared
|
|
// `runtime_config.provision_timeout_seconds`. Without this gate, the
|
|
// sweeper kept the hardcoded 10-min floor regardless of manifest —
|
|
// which is the original wiring gap that drove false-positive timeouts
|
|
// on cold-pull claude-code bursts.
|
|
//
|
|
// Order pinned:
|
|
//
|
|
// 1. PROVISION_TIMEOUT_SECONDS env beats everything (ops debug).
|
|
// 2. Manifest lookup beats hermes special-case + default.
|
|
// 3. Hermes default applies when lookup returns 0 for hermes.
|
|
// 4. DefaultProvisioningTimeout applies when lookup returns 0 for
|
|
// anything else.
|
|
// 5. Lookup returning 0 for ANY runtime is "no override" — never
|
|
// a 0-second timeout (which would kill every workspace instantly).
|
|
func TestProvisioningTimeout_ManifestOverride(t *testing.T) {
|
|
manifest := map[string]int{
|
|
"claude-code": 900, // 15 min — what an ops manifest bump would set
|
|
"langgraph": 1200,
|
|
"hermes": 2400, // 40 min — manifest can override hermes default too
|
|
}
|
|
lookup := func(runtime string) int { return manifest[runtime] }
|
|
|
|
cases := []struct {
|
|
name string
|
|
runtime string
|
|
want time.Duration
|
|
}{
|
|
{"manifest override beats default for claude-code", "claude-code", 900 * time.Second},
|
|
{"manifest override applied for langgraph", "langgraph", 1200 * time.Second},
|
|
{"manifest override beats hermes default", "hermes", 2400 * time.Second},
|
|
{"unknown runtime + no manifest entry → default", "unknown-runtime", DefaultProvisioningTimeout},
|
|
{"empty runtime + no manifest entry → default", "", DefaultProvisioningTimeout},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
if got := provisioningTimeoutFor(c.runtime, lookup); got != c.want {
|
|
t.Errorf("got %v, want %v", got, c.want)
|
|
}
|
|
})
|
|
}
|
|
|
|
// Env override beats manifest — ops debug must be the top priority.
|
|
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
|
|
if got := provisioningTimeoutFor("claude-code", lookup); got.Seconds() != 60 {
|
|
t.Errorf("env-override should beat manifest: got %v, want 60s", got)
|
|
}
|
|
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
|
|
|
|
// Lookup returning 0 means "no entry" — must NOT result in a
|
|
// 0-second timeout. Falls through to runtime defaults.
|
|
zeroLookup := func(_ string) int { return 0 }
|
|
if got := provisioningTimeoutFor("claude-code", zeroLookup); got != DefaultProvisioningTimeout {
|
|
t.Errorf("zero-from-lookup should fall through to default, got %v", got)
|
|
}
|
|
if got := provisioningTimeoutFor("hermes", zeroLookup); got != HermesProvisioningTimeout {
|
|
t.Errorf("zero-from-lookup should fall through to hermes default, got %v", got)
|
|
}
|
|
}
|