Files
Molecule AI Dev Engineer A (Kimi) 02a3de7c0e
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 4s
CI / Python Lint & Test (pull_request) Successful in 4s
E2E API Smoke Test / detect-changes (pull_request) Successful in 12s
CI / Detect changes (pull_request) Successful in 13s
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 12s
E2E Chat / detect-changes (pull_request) Successful in 12s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 10s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 4s
Harness Replays / detect-changes (pull_request) Successful in 5s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 4s
Lint no tenant GITEA or GITHUB token write / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 4s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 5s
gate-check-v3 / gate-check (pull_request) Successful in 6s
qa-review / approved (pull_request) Failing after 7s
sop-checklist / na-declarations (pull_request) N/A: (none)
security-review / approved (pull_request) Failing after 7s
sop-checklist / review-refire (pull_request) Has been skipped
sop-checklist / all-items-acked (pull_request) Successful in 4s
sop-tier-check / tier-check (pull_request) Successful in 4s
CI / Canvas (Next.js) (pull_request) Successful in 6s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 2s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m13s
E2E Chat / E2E Chat (pull_request) Successful in 6s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 7s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 1m37s
Harness Replays / Harness Replays (pull_request) Successful in 4s
CI / Canvas Deploy Reminder (pull_request) Has been skipped
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 2m16s
E2E Staging External Runtime / E2E Staging External Runtime (pull_request) Successful in 5m4s
CI / Platform (Go) (pull_request) Successful in 5m20s
CI / all-required (pull_request) Successful in 6m5s
audit-force-merge / audit (pull_request) Successful in 10s
fix(workspace-server): replace time.After with time.NewTimer to prevent goroutine leaks
Inside loops, time.After allocates a new timer each iteration that
(before Go 1.23) cannot be GC'd until it fires. In long-running loops
(supervisor restart backoff, Telegram polling, restart-context polling,
CP stop retry) this leaves unfired timers pending in proportion to the
iteration count.

Replace with time.NewTimer + timer.Stop() on ctx cancellation so the
timer is cleaned up immediately when the goroutine exits.

Affected files:
- supervised/supervised.go (RunWithRecover backoff)
- channels/telegram.go (429 retry + poll error sleep)
- handlers/restart_context.go (online + heartbeat polling)
- handlers/workspace_restart.go (cpStop retry backoff)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 09:45:31 +00:00

145 lines
4.2 KiB
Go

// Package supervised provides a panic-recovering supervisor for long-running
// background goroutines on the platform. Every "go X.Start(ctx)" invocation
// in main.go should go through [RunWithRecover] so a single panic from one
// tenant's data cannot silently kill a subsystem that serves every tenant.
//
// Incident that motivated this (issue #85, 2026-04-14):
//
// The cron scheduler goroutine died silently at 14:21 UTC and stayed dead
// for 12+ hours. Platform restart didn't recover it. Root cause: no
// defer recover() in the tick loop. Observable signals (HTTP 200, container
// healthy, DB healthy) all stayed green — only the subsystem was dead.
//
// In a multi-tenant SaaS deployment the blast radius is every tenant
// simultaneously, which is exactly the class of failure we cannot afford.
package supervised
import (
"context"
"log"
"runtime/debug"
"sync"
"time"
)
// Bounds on the delay RunWithRecover waits between restarts of a panicking fn.
const (
	// initialBackoff is the delay used before the first restart.
	initialBackoff = time.Second
	// maxBackoff is the ceiling the doubling restart delay is clamped to.
	maxBackoff = 30 * time.Second
)
// RunWithRecover runs fn in a recover wrapper. If fn panics, the panic is
// logged with its stack trace and fn is restarted after an exponential
// backoff (starting at initialBackoff, doubling per panic, capped at
// maxBackoff). The backoff only grows for the lifetime of this call; it is
// not reset after a restart that runs successfully for a while.
//
// fn is expected to be a long-running loop (e.g. "for { select { ticker ... } }").
// If fn returns without panicking (e.g. ctx.Done), RunWithRecover returns.
// RunWithRecover also returns, without (re)starting fn, once ctx is done.
// Every exit path emits a log line so a stopped subsystem is never silent.
//
//	go supervised.RunWithRecover(ctx, "scheduler", func(c context.Context) {
//		scheduler.Start(c)
//	})
//
// name appears in every log line emitted here. RunWithRecover itself does
// not read or write the liveness registry.
func RunWithRecover(ctx context.Context, name string, fn func(context.Context)) {
	backoff := initialBackoff
	for {
		// Never (re)start fn once the context has already been cancelled.
		select {
		case <-ctx.Done():
			log.Printf("supervised[%s]: context done; stopping", name)
			return
		default:
		}

		// Clean return → the goroutine decided to stop (likely ctx.Done
		// inside fn). Don't restart.
		if !runOnce(ctx, name, fn) {
			log.Printf("supervised[%s]: returned cleanly; not restarting", name)
			return
		}

		// Panic → wait out the backoff, but give up immediately if the
		// context is cancelled in the meantime.
		timer := time.NewTimer(backoff)
		select {
		case <-ctx.Done():
			// Release the timer right away instead of letting it run out.
			timer.Stop()
			log.Printf("supervised[%s]: context done during restart backoff; stopping", name)
			return
		case <-timer.C:
		}

		// Double the delay for the next panic, clamped to maxBackoff.
		backoff *= 2
		if backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}
// runOnce calls fn(ctx) exactly once behind a deferred recover. It reports
// true when fn panicked (the panic value and stack trace are logged under
// name) and false when fn returned normally.
func runOnce(ctx context.Context, name string, fn func(context.Context)) (panicked bool) {
	defer func() {
		cause := recover()
		if cause == nil {
			// fn returned normally; leave the result as false.
			return
		}
		log.Printf("supervised[%s]: PANIC recovered: %v\n%s", name, cause, debug.Stack())
		panicked = true
	}()

	fn(ctx)
	return false
}
// --- Liveness registry -----------------------------------------------------
//
// Each subsystem calls Heartbeat(name) at the end of each tick / iteration.
// Operators read the registry via /admin/liveness to detect stuck-but-not-
// crashed subsystems (e.g. a tick that deadlocks without panicking).
var (
	// livenessMu guards every read and write of lastTicks.
	livenessMu sync.RWMutex
	// lastTicks maps a subsystem name to the time of its latest Heartbeat.
	lastTicks = make(map[string]time.Time)
)
// Heartbeat stamps subsystem `name` as alive at the current wall-clock time,
// overwriting any earlier entry for the same name.
func Heartbeat(name string) {
	livenessMu.Lock()
	defer livenessMu.Unlock()

	lastTicks[name] = time.Now()
}
// LastTick reports when subsystem `name` last called Heartbeat. A subsystem
// that has never heartbeated yields the zero time.Time.
func LastTick(name string) time.Time {
	livenessMu.RLock()
	tick := lastTicks[name] // a missing key yields the zero time
	livenessMu.RUnlock()
	return tick
}
// Snapshot copies the whole registry (subsystem name → last-tick time) into
// a fresh map, for admin endpoints. The caller owns the returned map;
// changing it does not affect the registry.
func Snapshot() map[string]time.Time {
	livenessMu.RLock()
	defer livenessMu.RUnlock()

	snapshot := make(map[string]time.Time, len(lastTicks))
	for subsystem := range lastTicks {
		snapshot[subsystem] = lastTicks[subsystem]
	}
	return snapshot
}
// IsHealthy returns true iff every subsystem in `expected` has ticked
// within `staleThreshold` of now. Use from /health (or a strict variant of
// it) to surface stuck subsystems to an external orchestrator.
//
// A subsystem counts as stale when it has never heartbeated or when its
// last heartbeat is older than staleThreshold. stale lists the offending
// names in the order they appear in expected and is nil when healthy.
func IsHealthy(expected []string, staleThreshold time.Duration) (healthy bool, stale []string) {
	livenessMu.RLock()
	defer livenessMu.RUnlock()

	checkedAt := time.Now()
	for _, subsystem := range expected {
		// Fresh means: known to the registry and not older than the threshold.
		if seen, known := lastTicks[subsystem]; known && checkedAt.Sub(seen) <= staleThreshold {
			continue
		}
		stale = append(stale, subsystem)
	}

	healthy = len(stale) == 0
	return healthy, stale
}