Files
molecule-core/workspace-server/internal/registry/provisiontimeout.go
T
claude-ceo-assistant f7e2976324
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 9s
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 7s
Check migration collisions / Migration version collision check (pull_request) Successful in 10s
CI / Detect changes (pull_request) Successful in 7s
CI / Python Lint & Test (pull_request) Successful in 5s
E2E API Smoke Test / detect-changes (pull_request) Successful in 7s
E2E Chat / detect-changes (pull_request) Successful in 7s
E2E Peer Visibility (literal MCP list_peers) / E2E Peer Visibility (pull_request) Successful in 5s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 10s
E2E Staging SaaS (full lifecycle) / E2E Staging SaaS (pull_request) Has been skipped
Handlers Postgres Integration / detect-changes (pull_request) Successful in 6s
Harness Replays / detect-changes (pull_request) Successful in 4s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 4s
E2E Staging SaaS (full lifecycle) / pr-validate (pull_request) Successful in 33s
E2E Peer Visibility (literal MCP list_peers) / E2E Peer Visibility (local) (pull_request) Successful in 50s
Lint no tenant GITEA or GITHUB token write / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 8s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 9s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 58s
gate-check-v3 / gate-check (pull_request) Successful in 4s
qa-review / approved (pull_request) Successful in 3s
security-review / approved (pull_request) Successful in 3s
sop-checklist / na-declarations (pull_request) N/A: (none)
sop-checklist / all-items-acked (pull_request) Successful in 4s
sop-checklist / review-refire (pull_request) Has been skipped
sop-tier-check / tier-check (pull_request) Successful in 4s
Ops Scripts Tests / Ops scripts (unittest) (pull_request) Successful in 1m6s
E2E Staging External Runtime / E2E Staging External Runtime (pull_request) Successful in 5m25s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 20s
E2E Chat / E2E Chat (pull_request) Successful in 33s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 11s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 1m58s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 2m44s
Harness Replays / Harness Replays (pull_request) Successful in 6s
CI / Platform (Go) (pull_request) Successful in 6m9s
CI / Canvas (Next.js) (pull_request) Successful in 7m41s
CI / all-required (pull_request) Successful in 32m0s
CI / Canvas Deploy Reminder (pull_request) Has been skipped
audit-force-merge / audit (pull_request) Successful in 32s
chore: retire unmaintained workspace runtimes
2026-05-23 23:45:09 -07:00

214 lines
8.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package registry
import (
"context"
"log"
"os"
"strconv"
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
)
// ProvisionTimeoutEmitter is the narrow broadcaster dependency the sweeper
// needs. Defined locally so the registry package stays event-bus agnostic
// (same pattern as OfflineHandler in healthsweep.go).
type ProvisionTimeoutEmitter interface {
RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error
}
// DefaultProvisioningTimeout is how long a workspace may sit in
// status='provisioning' before the sweeper flips it to 'failed'.
// Default for non-hermes runtimes (claude-code, codex, openclaw, etc.)
// which cold-boot in <5 min. The container-launch path
// has its own 3-minute context timeout (provisioner.ProvisionTimeout)
// but that only bounds the docker API call — a container that started
// but crashes before /registry/register never triggers that path and
// would sit in provisioning forever. 10 minutes covers pathological
// image-pull + user-data execution on a cold EC2 worker while still
// getting well ahead of the "15+ minute" stuck state users see in
// production.
const DefaultProvisioningTimeout = 10 * time.Minute
// HermesProvisioningTimeout matches the CP bootstrap-watcher's
// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
// + 5 min sweep slack. Hermes cold-boot does apt + uv + Python venv +
// Node + hermes-agent install — 1325 min on slow apt mirrors is
// normal. Without this, the sweep would flip the workspace to 'failed'
// at 10 min while the watcher (and the workspace itself) is still
// happily progressing through install. Issue #1843 follow-up: a
// healthy 10.5-min hermes boot was killed by the 10-min sweep on
// 2026-04-26, breaking #2061's E2E.
const HermesProvisioningTimeout = 30 * time.Minute
// DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
// as the hibernation monitor — cheap and bounded by the provisioning-state
// query which hits the primary key / status partial index.
const DefaultProvisionSweepInterval = 30 * time.Second
// RuntimeTimeoutLookup returns the per-runtime provision timeout in
// seconds when a template's config.yaml declared
// `runtime_config.provision_timeout_seconds`, else zero (= "no override,
// fall through to runtime defaults below"). Same shape as
// runtimeProvisionTimeoutsCache.get in handlers — wired through main.go
// so this package stays template-discovery agnostic.
//
// Why an interface instead of importing the cache directly: registry
// already sits below handlers in the import graph (handlers → registry,
// not the reverse). A function-typed argument keeps that flow.
type RuntimeTimeoutLookup func(runtime string) int
// provisioningTimeoutFor picks the per-runtime sweep deadline. Resolution
// order:
//
// 1. PROVISION_TIMEOUT_SECONDS env — global override, ops-debug only.
// 2. Template manifest override (lookup) — what the canvas spinner
// also reads via #2054 phase 2. Without this, a template that
// declared `runtime_config.provision_timeout_seconds: 900` would
// still get killed by the sweeper at the 10-min hardcoded floor —
// a real wiring gap that drove every claude-code burst on a cold
// EC2 to false-positive timeout.
// 3. Hermes special-case (CP bootstrap-watcher 25 min + 5 min slack).
// 4. DefaultProvisioningTimeout (10 min) for everything else.
//
// lookup may be nil (during package tests, or before main.go has wired
// it) — falls through to the legacy hermes/default split.
func provisioningTimeoutFor(runtime string, lookup RuntimeTimeoutLookup) time.Duration {
if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n > 0 {
return time.Duration(n) * time.Second
}
}
if lookup != nil {
if secs := lookup(runtime); secs > 0 {
return time.Duration(secs) * time.Second
}
}
if runtime == "hermes" {
return HermesProvisioningTimeout
}
return DefaultProvisioningTimeout
}
// StartProvisioningTimeoutSweep periodically scans for workspaces stuck in
// `status='provisioning'` past the timeout window, flips them to `failed`,
// and broadcasts a WORKSPACE_PROVISION_TIMEOUT event so the canvas can
// render a fail-state instead of the indefinite cosmetic "Provisioning
// Timeout" banner.
//
// The sweep is idempotent: the UPDATE's WHERE clause re-checks both status
// and age under the same row lock, so a workspace that raced to `online` or
// was restarted while the sweep was scanning will not get flipped.
func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration, lookup RuntimeTimeoutLookup) {
if emitter == nil {
log.Println("Provision-timeout sweep: emitter is nil — skipping (no one to broadcast to)")
return
}
if interval <= 0 {
interval = DefaultProvisionSweepInterval
}
ticker := time.NewTicker(interval)
defer ticker.Stop()
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes / per-runtime manifest override=%v)",
interval, DefaultProvisioningTimeout, HermesProvisioningTimeout, lookup != nil)
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
sweepStuckProvisioning(ctx, emitter, lookup)
}
}
}
// sweepStuckProvisioning is one tick of the sweeper. Exported-for-test via
// the package boundary: keep all time.Now reads inside so tests can drive it
// deterministically by seeding updated_at rather than manipulating time.
//
// Runtime-aware: the per-workspace timeout depends on `runtime`. Hermes
// gets 30 min (matching the CP bootstrap-watcher's 25-min deadline + 5
// min slack); everything else gets 10 min. Without this distinction a
// healthy hermes cold-boot at 1025 min got killed mid-install by this
// sweep, leaving an incoherent "marked failed but actually working"
// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
// canonical CP-side gating.
func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter, lookup RuntimeTimeoutLookup) {
// We can't pre-filter by age in SQL because the threshold depends
// on the row's runtime. Pull every provisioning row + its runtime
// + its age, evaluate per-row in Go. Still cheap — the
// status='provisioning' row count is bounded (workspaces in
// flight, not historical) and the partial index on status keeps
// it fast.
rows, err := db.DB.QueryContext(ctx, `
SELECT id, COALESCE(runtime, ''), EXTRACT(EPOCH FROM (now() - updated_at))::int
FROM workspaces
WHERE status = 'provisioning'
`)
if err != nil {
log.Printf("Provision-timeout sweep: query error: %v", err)
return
}
defer rows.Close()
type candidate struct {
id string
runtime string
ageSec int
}
var ids []candidate
for rows.Next() {
var c candidate
if err := rows.Scan(&c.id, &c.runtime, &c.ageSec); err == nil {
ids = append(ids, c)
}
}
if err := rows.Err(); err != nil {
log.Printf("Provision-timeout sweep: rows error: %v", err)
}
for _, c := range ids {
timeout := provisioningTimeoutFor(c.runtime, lookup)
timeoutSec := int(timeout / time.Second)
if c.ageSec < timeoutSec {
continue
}
msg := "provisioning timed out — container started but never called /registry/register. Check container logs and network connectivity to the platform."
res, err := db.DB.ExecContext(ctx, `
UPDATE workspaces
SET status = $4,
last_sample_error = $2,
updated_at = now()
WHERE id = $1
AND status = 'provisioning'
AND updated_at < now() - ($3 || ' seconds')::interval
`, c.id, msg, timeoutSec, models.StatusFailed)
if err != nil {
log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", c.id, err)
continue
}
affected, _ := res.RowsAffected()
if affected == 0 {
// Raced with restart / register — no harm, just skip.
continue
}
log.Printf("Provision-timeout sweep: %s (runtime=%q) stuck in provisioning > %s — marked failed", c.id, c.runtime, timeout)
// Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the
// canvas event handler only flips node state on the _FAILED case.
// A separate event type was considered but the UI reaction is
// identical either way — operators who need to distinguish can
// tell from the `source` payload field.
if emitErr := emitter.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), c.id, map[string]interface{}{
"error": msg,
"timeout_secs": timeoutSec,
"runtime": c.runtime,
"source": "provision_timeout_sweep",
}); emitErr != nil {
log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", c.id, emitErr)
}
}
}