fix(registry): runtime-aware provision-timeout sweep — give hermes 30 min

Pre-fix: workspace-server's provision-timeout sweep was hardcoded
at 10 min for all runtimes. The CP-side bootstrap-watcher (cp#245)
correctly gives hermes 25 min for cold-boot (hermes installs
include apt + uv + Python venv + Node + hermes-agent — 13–25 min on
slow apt mirrors is normal). The two timeout systems disagreed:
the watcher would happily wait 25 min, but the workspace-server's
sweep killed healthy hermes boots mid-install at the 10-minute mark
and marked them failed.

Today's example: #2061's E2E run on 2026-04-26 at 08:06:34Z created
a hermes workspace; EC2 cloud-init was visibly making progress on
apt installs (libcjson1, libmbedcrypto7t64) when the sweep flipped
status to 'failed' at 08:17:00Z (10:26 elapsed). The test threw
"Workspace failed: " (empty error from sql.NullString serialization)
and CI failed on a healthy boot.
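
Minimal illustration of the empty-error mechanics: a NULL error
column scans into sql.NullString with Valid=false and a zero-value
String, and that zero value is what got formatted into the message.
(A sketch, not the registry's actual serialization path.)

  package main

  import (
      "database/sql"
      "fmt"
  )

  func main() {
      var e sql.NullString // as if scanned from a NULL column: Valid=false, String=""
      fmt.Printf("Workspace failed: %s\n", e.String) // prints: Workspace failed:
  }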

Fix: provisioningTimeoutFor(runtime) — same shape as the CP's
bootstrapTimeoutFn:
  - hermes:  30 min (watcher's 25 min + 5 min slack)
  - others:  10 min (unchanged — claude-code/langgraph/etc. boot
                     in <5 min, 10 min is plenty)
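
In sketch form (the full helper, env override included, is in the
diff below):

  func provisioningTimeoutFor(runtime string) time.Duration {
      // env-override branch elided here; see the diff
      if runtime == "hermes" {
          return HermesProvisioningTimeout // 30 * time.Minute
      }
      return DefaultProvisioningTimeout // 10 * time.Minute
  }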

PROVISION_TIMEOUT_SECONDS env override still works (applies to all
runtimes — operators who care about the runtime distinction
shouldn't use the override anyway).
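
The override wins over the runtime check, so it shortens hermes too;
illustrative calls against the helper from the diff:

  os.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
  provisioningTimeoutFor("hermes")      // 60s, override beats the 30-min default
  provisioningTimeoutFor("claude-code") // 60s, same flat value for every runtime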

Sweep query change: pulls (id, runtime, age_sec) per row instead
of pre-filtering by age in SQL. Per-row Go evaluation picks the
correct timeout. Slightly more rows scanned but bounded by the
status='provisioning' partial index — workspaces in flight, not
historical.
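
For reference, the partial index this leans on would be roughly the
following shape (the migration predates this diff, so the name and
exact definition here are assumed):

  // hypothetical migration SQL, Go-embedded to match the codebase:
  const createProvisioningIdx = `
      CREATE INDEX IF NOT EXISTS workspaces_status_provisioning_idx
      ON workspaces (updated_at)
      WHERE status = 'provisioning'`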

Tests:
  - TestProvisioningTimeout_RuntimeAware — locks in the per-runtime
    mapping
  - TestSweepStuckProvisioning_HermesGets30MinSlack — hermes at
    11 min must NOT be flipped
  - TestSweepStuckProvisioning_HermesPastDeadline — hermes at
    31 min IS flipped, payload includes runtime
  - Existing tests updated for the new query shape

Verified:
  - go build ./... clean
  - go vet ./... clean
  - go test ./... all green

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@ -18,30 +18,49 @@ type ProvisionTimeoutEmitter interface {
 }

 // DefaultProvisioningTimeout is how long a workspace may sit in
-// status='provisioning' before the sweeper flips it to 'failed'. The
-// container-launch path has its own 3-minute context timeout
-// (provisioner.ProvisionTimeout) but that only bounds the docker API call —
-// a container that started but crashes before /registry/register never
-// triggers that path and would sit in provisioning forever. 10 minutes
-// covers pathological image-pull + user-data execution on a cold EC2 worker
-// while still getting well ahead of the "15+ minute" stuck state users see
-// in production.
+// status='provisioning' before the sweeper flips it to 'failed'.
+// Default for non-hermes runtimes (claude-code, langgraph, crewai,
+// autogen, etc.) which cold-boot in <5 min. The container-launch path
+// has its own 3-minute context timeout (provisioner.ProvisionTimeout)
+// but that only bounds the docker API call — a container that started
+// but crashes before /registry/register never triggers that path and
+// would sit in provisioning forever. 10 minutes covers pathological
+// image-pull + user-data execution on a cold EC2 worker while still
+// getting well ahead of the "15+ minute" stuck state users see in
+// production.
 const DefaultProvisioningTimeout = 10 * time.Minute

+// HermesProvisioningTimeout matches the CP bootstrap-watcher's
+// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
+// + 5 min sweep slack. Hermes cold-boot does apt + uv + Python venv +
+// Node + hermes-agent install — 13–25 min on slow apt mirrors is
+// normal. Without this, the sweep would flip the workspace to 'failed'
+// at 10 min while the watcher (and the workspace itself) is still
+// happily progressing through install. Issue #1843 follow-up: a
+// healthy 10.5-min hermes boot was killed by the 10-min sweep on
+// 2026-04-26, breaking #2061's E2E.
+const HermesProvisioningTimeout = 30 * time.Minute
+
 // DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
 // as the hibernation monitor — cheap and bounded by the provisioning-state
 // query which hits the primary key / status partial index.
 const DefaultProvisionSweepInterval = 30 * time.Second

-// provisioningTimeout reads the override from env, falling back to the
-// default. Env var expressed in seconds so operators can tune via a normal
-// container restart without a code change.
-func provisioningTimeout() time.Duration {
+// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
+// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
+// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
+// runtimes — useful for ops debugging but loses the runtime nuance, so
+// operators should prefer the defaults unless they have a specific
+// reason.
+func provisioningTimeoutFor(runtime string) time.Duration {
     if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
         if n, err := strconv.Atoi(v); err == nil && n > 0 {
             return time.Duration(n) * time.Second
         }
     }
+    if runtime == "hermes" {
+        return HermesProvisioningTimeout
+    }
     return DefaultProvisioningTimeout
 }
@ -65,7 +84,8 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
     ticker := time.NewTicker(interval)
     defer ticker.Stop()

-    log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s)", interval, provisioningTimeout())
+    log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
+        interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)

     for {
         select {
@ -80,33 +100,51 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
 // sweepStuckProvisioning is one tick of the sweeper. Exported-for-test via
 // the package boundary: keep all time.Now reads inside so tests can drive it
 // deterministically by seeding updated_at rather than manipulating time.
+//
+// Runtime-aware: the per-workspace timeout depends on `runtime`. Hermes
+// gets 30 min (matching the CP bootstrap-watcher's 25-min deadline + 5
+// min slack); everything else gets 10 min. Without this distinction a
+// healthy hermes cold-boot at 10–25 min got killed mid-install by this
+// sweep, leaving an incoherent "marked failed but actually working"
+// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
+// canonical CP-side gating.
 func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
-    timeout := provisioningTimeout()
-    timeoutSec := int(timeout / time.Second)
-
-    // Read candidates first so the event broadcast can include each id. The
-    // subsequent UPDATE re-checks the predicate to stay race-safe against
-    // concurrent restart / register paths that write updated_at.
+    // We can't pre-filter by age in SQL because the threshold depends
+    // on the row's runtime. Pull every provisioning row + its runtime
+    // + its age, evaluate per-row in Go. Still cheap — the
+    // status='provisioning' row count is bounded (workspaces in
+    // flight, not historical) and the partial index on status keeps
+    // it fast.
     rows, err := db.DB.QueryContext(ctx, `
-        SELECT id FROM workspaces
+        SELECT id, COALESCE(runtime, ''), EXTRACT(EPOCH FROM (now() - updated_at))::int
+        FROM workspaces
         WHERE status = 'provisioning'
-        AND updated_at < now() - ($1 || ' seconds')::interval
-    `, timeoutSec)
+    `)
     if err != nil {
         log.Printf("Provision-timeout sweep: query error: %v", err)
         return
     }
     defer rows.Close()

-    var ids []string
+    type candidate struct {
+        id      string
+        runtime string
+        ageSec  int
+    }
+    var ids []candidate
     for rows.Next() {
-        var id string
-        if err := rows.Scan(&id); err == nil {
-            ids = append(ids, id)
+        var c candidate
+        if err := rows.Scan(&c.id, &c.runtime, &c.ageSec); err == nil {
+            ids = append(ids, c)
         }
     }

-    for _, id := range ids {
+    for _, c := range ids {
+        timeout := provisioningTimeoutFor(c.runtime)
+        timeoutSec := int(timeout / time.Second)
+        if c.ageSec < timeoutSec {
+            continue
+        }
         msg := "provisioning timed out — container started but never called /registry/register. Check container logs and network connectivity to the platform."
         res, err := db.DB.ExecContext(ctx, `
             UPDATE workspaces
@ -116,9 +154,9 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
             WHERE id = $1
             AND status = 'provisioning'
             AND updated_at < now() - ($3 || ' seconds')::interval
-        `, id, msg, timeoutSec)
+        `, c.id, msg, timeoutSec)
         if err != nil {
-            log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", id, err)
+            log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", c.id, err)
             continue
         }
         affected, _ := res.RowsAffected()
@ -126,18 +164,19 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
             // Raced with restart / register — no harm, just skip.
             continue
         }
-        log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
+        log.Printf("Provision-timeout sweep: %s (runtime=%q) stuck in provisioning > %s — marked failed", c.id, c.runtime, timeout)

         // Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the
         // canvas event handler only flips node state on the _FAILED case.
         // A separate event type was considered but the UI reaction is
         // identical either way — operators who need to distinguish can
         // tell from the `source` payload field.
-        if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{
+        if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", c.id, map[string]interface{}{
             "error":        msg,
             "timeout_secs": timeoutSec,
+            "runtime":      c.runtime,
             "source":       "provision_timeout_sweep",
         }); emitErr != nil {
-            log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
+            log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", c.id, emitErr)
         }
     }
 }


@ -5,6 +5,7 @@ import (
"errors"
"sync"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
)
@ -40,13 +41,24 @@ func (f *fakeEmitter) count() int {
     return len(f.events)
 }

+// candidateRows builds the new-shape query result (id, runtime, age_sec).
+// Use this in every sweep test to match the runtime-aware SELECT.
+func candidateRows(rows ...[3]any) *sqlmock.Rows {
+    r := sqlmock.NewRows([]string{"id", "runtime", "age_sec"})
+    for _, row := range rows {
+        r = r.AddRow(row[0], row[1], row[2])
+    }
+    return r
+}
+
 // TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
 // provisioning workspace gets flipped to failed AND an event is broadcast.
 func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+    // claude-code workspace, 700s old > 600s default timeout → flipped.
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -69,6 +81,60 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
     }
 }

+// TestSweepStuckProvisioning_HermesGets30MinSlack — the regression that
+// motivated the runtime-aware change. A hermes workspace 11 min into
+// cold-boot must NOT be flipped to failed; the watcher's 25-min budget
+// covers it. Without the fix, the 10-min sweep killed healthy hermes
+// boots mid-install (issue #2061's E2E failure on 2026-04-26).
+func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
+    mock := setupTestDB(t)
+
+    // 11 min = 660 sec < HermesProvisioningTimeout (1800 sec).
+    // No UPDATE should fire — hermes still has time.
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
+
+    emit := &fakeEmitter{}
+    sweepStuckProvisioning(context.Background(), emit)
+
+    if emit.count() != 0 {
+        t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
+    }
+    if err := mock.ExpectationsWereMet(); err != nil {
+        t.Errorf("unmet expectations: %v", err)
+    }
+}
+
+// TestSweepStuckProvisioning_HermesPastDeadline — a hermes workspace
+// past 30 min DOES get flipped. Closes the loop on the runtime-aware
+// fix: it's still bounded, just with a longer threshold than other
+// runtimes.
+func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
+    mock := setupTestDB(t)
+
+    // 31 min = 1860 sec > HermesProvisioningTimeout (1800 sec).
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-hermes-stuck", "hermes", 1860}))
+
+    mock.ExpectExec(`UPDATE workspaces`).
+        WithArgs("ws-hermes-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
+        WillReturnResult(sqlmock.NewResult(0, 1))
+
+    emit := &fakeEmitter{}
+    sweepStuckProvisioning(context.Background(), emit)
+
+    if emit.count() != 1 {
+        t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
+    }
+    // Payload should include runtime so ops can distinguish in logs.
+    payload, ok := emit.events[0].Payload.(map[string]interface{})
+    if !ok {
+        t.Fatalf("payload not a map: %T", emit.events[0].Payload)
+    }
+    if payload["runtime"] != "hermes" {
+        t.Errorf("payload.runtime = %v, want hermes", payload["runtime"])
+    }
+}
+
 // TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
 // 0 rows because the workspace flipped to online (or got restarted) between
 // the SELECT and the UPDATE. We should skip the event, not emit a false
@ -76,8 +142,8 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
 func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-raced"))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-raced", "claude-code", 700}))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -99,8 +165,8 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
 func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows())

     emit := &fakeEmitter{}
     sweepStuckProvisioning(context.Background(), emit)
@ -115,14 +181,16 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
 // TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
 // both agents (claude-code + hermes) are stuck — both should get flipped
-// and both should get events.
+// and both should get events. claude-code at 11 min (over its 10-min
+// limit), hermes at 31 min (over its 30-min limit).
 func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).
-            AddRow("ws-claude-code").
-            AddRow("ws-hermes"))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows(
+            [3]any{"ws-claude-code", "claude-code", 700},
+            [3]any{"ws-hermes", "hermes", 1860},
+        ))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -145,8 +213,8 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
 func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
     mock := setupTestDB(t)

-    mock.ExpectQuery(`SELECT id FROM workspaces`).
-        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+    mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+        WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))

     mock.ExpectExec(`UPDATE workspaces`).
         WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
         WillReturnResult(sqlmock.NewResult(0, 1))
@ -158,18 +226,47 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
 // TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
 // env var takes effect when set to a positive integer, and falls back to
-// default otherwise.
+// the per-runtime default otherwise.
 func TestProvisioningTimeout_EnvOverride(t *testing.T) {
     t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
-    if got := provisioningTimeout(); got.Seconds() != 60 {
-        t.Errorf("override: got %v, want 60s", got)
+    // When env override is set it wins over runtime defaults.
+    if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
+        t.Errorf("override (no runtime): got %v, want 60s", got)
+    }
+    if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
+        t.Errorf("override (hermes): got %v, want 60s", got)
     }

     t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
-    if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
-        t.Errorf("default: got %v, want %v", got, DefaultProvisioningTimeout)
+    if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
+        t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
     }

     t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
-    if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
-        t.Errorf("bad override: got %v, want default %v", got, DefaultProvisioningTimeout)
+    if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
+        t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
     }
 }

+// TestProvisioningTimeout_RuntimeAware verifies hermes gets the longer
+// HermesProvisioningTimeout while other runtimes keep the default.
+// Mirrors bootstrap_watcher.go's bootstrapTimeoutFn — these two
+// timeouts must stay in sync (sweep > watcher) or healthy hermes
+// boots get killed mid-install.
+func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
+    cases := []struct {
+        runtime string
+        want    time.Duration
+    }{
+        {"hermes", HermesProvisioningTimeout},
+        {"langgraph", DefaultProvisioningTimeout},
+        {"claude-code", DefaultProvisioningTimeout},
+        {"crewai", DefaultProvisioningTimeout},
+        {"autogen", DefaultProvisioningTimeout},
+        {"", DefaultProvisioningTimeout},
+        {"unknown-runtime", DefaultProvisioningTimeout},
+    }
+    for _, c := range cases {
+        if got := provisioningTimeoutFor(c.runtime); got != c.want {
+            t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
+        }
+    }
+}