molecule-core/workspace-server/internal/handlers/delegation_sweeper.go
Hongming Wang 02b325063b feat(delegations): stuck-task sweeper with deadline + heartbeat-staleness rules (RFC #2829 PR-3)
Periodically scans the `delegations` table (PR-1 schema) for in-flight
rows that need terminal action:

  1. Deadline-exceeded → marked `failed` with "deadline exceeded by sweeper"
  2. Heartbeat-stale (no beat for >10× heartbeat interval) → marked `stuck`

## Why both rules

Deadline catches forever-heartbeating wedged agents (the alive-but-not-
advancing class: the agent loops on its heartbeat call inside its main
loop). Heartbeat-staleness catches OOM-killed and crashed agents that
stop cold without a graceful shutdown. Either rule alone misses one of
these classes.

## Order matters

Deadline is checked first. A deadline-exceeded AND stale row is marked
`failed` (operator action: investigate + give up), not `stuck` (operator
action: investigate + retry). The semantic difference matters.

## NULL heartbeat is a free pass

A delegation that has just been inserted but hasn't emitted its first
heartbeat yet is NOT marked stuck; this gives the agent its first beat
window and lets the deadline rule catch true never-started rows naturally.

## Concurrent-completion safety

A sweep can race SetStatus on a delegation that just completed. The
ledger's terminal forward-only protection (PR-1) returns
ErrInvalidTransition; the sweeper logs it and counts it in Errors, and
the row correctly stays completed.

## Configuration

  - DELEGATION_SWEEPER_INTERVAL_S — tick cadence (default 5min)
  - DELEGATION_STUCK_THRESHOLD_S  — heartbeat-staleness threshold (default 10min)

Both fall back gracefully on invalid input (a typo'd env var shouldn't
crash startup). Both are read at construction time, so a long-running
process picks up overrides only via restart.

## Wiring

NOT wired into main.go in this PR; that ships separately so the
sweeper can be enabled/disabled independently of the binary upgrade.
The sweeper is a standalone Sweep(ctx) callable plus a Start(ctx)
ticker loop, both with panic recovery. Each pass is cheap: a single
indexed scan over the partial idx_delegations_inflight_heartbeat
index from PR-1.

## Coverage

13 unit tests against sqlmock-backed *sql.DB:

Sweep semantics (8 tests):
  - empty in-flight set → clean no-op
  - deadline → failed
  - heartbeat-stale → stuck
  - NULL heartbeat is left alone (first-beat free pass)
  - healthy row → no-op
  - both-rule row → marked failed (deadline wins)
  - mixed set → both rules fire on the right rows
  - concurrent-completion race → forward-only protection holds

Env override parsing (5 tests):
  - default on missing env
  - parses positive seconds
  - falls back on garbage
  - falls back on negative
  - constructor picks up overrides; defaults when env unset

Refs RFC #2829.
2026-05-04 20:55:13 -07:00


package handlers

import (
	"context"
	"database/sql"
	"log"
	"os"
	"strconv"
	"time"

	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
)
// delegation_sweeper.go — RFC #2829 PR-3: stuck-task sweeper.
//
// What it does
// ------------
// Periodically scans the `delegations` table (PR-1 schema) for in-flight
// rows that have either:
//
//  1. Blown past their `deadline` — the agent may claim to still be
//     working, but the hard ceiling fired. Mark `failed` with
//     error_detail = "deadline exceeded by sweeper".
//  2. Stopped heartbeating for >stuckThreshold while still claiming
//     in_progress. Mark `stuck` with error_detail = "no heartbeat for Ns".
//
// Why both rules
// --------------
// Deadline catches forever-heartbeating agents that never make progress
// (a wedged agent looping on a heartbeat call inside its main work loop
// looks "alive" by liveness signals but is not actually advancing).
// Heartbeat-staleness catches agents that crash or get OOM-killed
// without graceful shutdown — no terminal status update fires, but the
// heartbeat stops cold.
//
// Order matters: the deadline check fires first because deadline → failed
// is a stronger statement than deadline → stuck. A stuck row can be
// retried by the operator; a failed row says "give up, retry was
// already exhausted or not viable."
//
// Frequency
// ---------
// 5min default cadence. Faster wastes DB round-trips against the hot
// index; slower means a stale task can sit uncaught for up to one extra
// interval after it crosses the threshold. Operators can override via
// DELEGATION_SWEEPER_INTERVAL_S.
//
// Threshold
// ---------
// The design target from the RFC #2829 discussion is ~10× the runtime's
// heartbeat interval (≈100s for hermes, which beats every 10s during
// stream output): 10× tolerates legitimate slow LLM responses (a single
// completion can stall a heartbeat for 30-60s) while still catching real
// wedges within ~2 minutes. The shipped default is a looser 10 minutes
// (see the constant below for why). Operators override via
// DELEGATION_STUCK_THRESHOLD_S.
//
// Safety
// ------
// All transitions go through DelegationLedger.SetStatus so the
// terminal-state forward-only protection applies — a delegation that
// transitioned to completed concurrently with the sweep won't be
// flipped back to failed/stuck. The ledger's same-status replay no-op
// also makes re-running the sweep idempotent.
const (
	defaultSweeperInterval = 5 * time.Minute

	// 10min = 60× the typical 10s hermes heartbeat. Tightens to ~10×
	// once the user community settles on a tighter heartbeat cadence;
	// today's mix of runtimes (hermes 10s, claude-code 30-60s, langchain
	// minute-scale) needs the looser threshold to avoid false positives.
	defaultStuckThreshold = 10 * time.Minute
)
// DelegationSweeper runs the periodic sweep. Construct via
// NewDelegationSweeper, then call Start(ctx) to begin ticking.
type DelegationSweeper struct {
	db        *sql.DB
	ledger    *DelegationLedger
	interval  time.Duration
	threshold time.Duration
}
// NewDelegationSweeper builds a sweeper bound to the package db.DB
// (production wiring) or a test handle. Reads optional env overrides
// at construction time so a long-running process picks them up via
// restart, not mid-flight.
func NewDelegationSweeper(handle *sql.DB, ledger *DelegationLedger) *DelegationSweeper {
	if handle == nil {
		handle = db.DB
	}
	if ledger == nil {
		ledger = NewDelegationLedger(handle)
	}
	return &DelegationSweeper{
		db:        handle,
		ledger:    ledger,
		interval:  envDuration("DELEGATION_SWEEPER_INTERVAL_S", defaultSweeperInterval),
		threshold: envDuration("DELEGATION_STUCK_THRESHOLD_S", defaultStuckThreshold),
	}
}
// envDuration parses an integer-seconds env var into a Duration. Falls
// back to def on missing/invalid input — never fails fast on misconfig
// (a typo'd env var should run with sane defaults, not crash startup).
func envDuration(key string, def time.Duration) time.Duration {
	v := os.Getenv(key)
	if v == "" {
		return def
	}
	n, err := strconv.Atoi(v)
	if err != nil || n <= 0 {
		log.Printf("delegation_sweeper: invalid %s=%q, using default %s", key, v, def)
		return def
	}
	return time.Duration(n) * time.Second
}
// Interval exposes the configured tick cadence — tests use it; main.go
// uses it implicitly via Start.
func (s *DelegationSweeper) Interval() time.Duration { return s.interval }

// Threshold exposes the heartbeat-staleness threshold.
func (s *DelegationSweeper) Threshold() time.Duration { return s.threshold }
// Start ticks Sweep() at the configured interval until ctx is cancelled.
// Every tick runs under panic recovery so a single bad row can't kill
// the sweeper.
//
// Intended wiring (ships in a later PR): `go sweeper.Start(ctx)` in
// main.go. Harmless until both the `delegations` table (PR-1) and the
// result-push flag (PR-2) have rolled out — until then the sweeper just
// won't find any rows to mark.
func (s *DelegationSweeper) Start(ctx context.Context) {
	t := time.NewTicker(s.interval)
	defer t.Stop()
	log.Printf("DelegationSweeper: started (interval=%s, stuck-threshold=%s)",
		s.interval, s.threshold)
	tickWithRecover := func() {
		defer func() {
			if r := recover(); r != nil {
				log.Printf("DelegationSweeper: PANIC in tick — recovered: %v", r)
			}
		}()
		s.Sweep(ctx)
	}
	// First sweep immediately so operators see it run on startup, not
	// after waiting one interval.
	tickWithRecover()
	for {
		select {
		case <-ctx.Done():
			log.Printf("DelegationSweeper: stopped")
			return
		case <-t.C:
			tickWithRecover()
		}
	}
}
// SweepResult records what the last sweep changed. Surfaced via the
// admin dashboard (PR-4); also useful for tests to assert behavior
// without diffing log lines.
type SweepResult struct {
	DeadlineFailures int
	StuckMarked      int
	Errors           int
}
// Sweep runs one pass: find every in-flight delegation, mark deadline-
// exceeded rows as failed, mark heartbeat-stale rows as stuck. Returns
// counts for observability.
//
// SQL strategy: one indexed scan over the partial inflight index, then
// at most one SetStatus update per offending row. We fold both checks
// into a single SELECT to amortize the round-trip — the in-flight row
// count at any time is small (single-digit hundreds even on a busy
// tenant), so reading them all and dispatching SetStatus per row is
// cheaper than two separate UPDATEs with bespoke WHERE clauses.
func (s *DelegationSweeper) Sweep(ctx context.Context) SweepResult {
	res := SweepResult{}
	rows, err := s.db.QueryContext(ctx, `
		SELECT delegation_id, last_heartbeat, deadline
		FROM delegations
		WHERE status IN ('queued','dispatched','in_progress')
	`)
	if err != nil {
		log.Printf("DelegationSweeper: query failed: %v", err)
		res.Errors++
		return res
	}
	defer rows.Close()

	now := time.Now()
	type candidate struct {
		id       string
		lastBeat sql.NullTime
		deadline time.Time
	}
	var todo []candidate
	for rows.Next() {
		var c candidate
		if err := rows.Scan(&c.id, &c.lastBeat, &c.deadline); err != nil {
			log.Printf("DelegationSweeper: scan failed: %v", err)
			res.Errors++
			continue
		}
		todo = append(todo, c)
	}
	if err := rows.Err(); err != nil {
		log.Printf("DelegationSweeper: rows.Err: %v", err)
		res.Errors++
	}

	for _, c := range todo {
		// Deadline first — stronger statement than stuck.
		if now.After(c.deadline) {
			if err := s.ledger.SetStatus(ctx, c.id, "failed",
				"deadline exceeded by sweeper", ""); err != nil {
				log.Printf("DelegationSweeper: SetStatus(%s, failed): %v", c.id, err)
				res.Errors++
				continue
			}
			res.DeadlineFailures++
			continue
		}
		// Heartbeat staleness. A NULL last_heartbeat is never marked
		// stuck: the row hasn't emitted its first beat yet, and the
		// SELECT doesn't carry created_at, so we can't tell a fresh row
		// from a never-started one. Give it the free pass and let the
		// deadline rule catch true never-started rows. Reduces false
		// positives during the agent's first beat window after restart.
		if !c.lastBeat.Valid {
			continue
		}
		if stale := now.Sub(c.lastBeat.Time); stale > s.threshold {
			if err := s.ledger.SetStatus(ctx, c.id, "stuck",
				"no heartbeat for "+stale.Round(time.Second).String(),
				""); err != nil {
				log.Printf("DelegationSweeper: SetStatus(%s, stuck): %v", c.id, err)
				res.Errors++
				continue
			}
			res.StuckMarked++
		}
	}
	if res.DeadlineFailures > 0 || res.StuckMarked > 0 || res.Errors > 0 {
		log.Printf("DelegationSweeper: sweep complete — deadline_failures=%d stuck=%d errors=%d",
			res.DeadlineFailures, res.StuckMarked, res.Errors)
	}
	return res
}