molecule-core/platform/internal/registry/healthsweep.go

package registry

import (
	"context"
	"log"
	"os"
	"strconv"
	"time"

	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
)

// ContainerChecker checks if a workspace container is running via Docker API.
type ContainerChecker interface {
	IsRunning(ctx context.Context, workspaceID string) (bool, error)
}

// DefaultRemoteStaleAfter is the default heartbeat-freshness window for
// `runtime='external'` workspaces before they're marked offline. Chosen
// slightly longer than the 60s Redis TTL so transient network hiccups
// don't cause a flapping online/offline ping-pong on the canvas. Override
// via `REMOTE_LIVENESS_STALE_AFTER` env var (integer seconds).
const DefaultRemoteStaleAfter = 90 * time.Second

// remoteStaleAfter reads the override from env, falling back to default.
// Called once per sweep tick — we don't cache because ops occasionally
// tune this live via a container restart, and the overhead of reading
// an env var on a 15s cadence is irrelevant.
func remoteStaleAfter() time.Duration {
	if v := os.Getenv("REMOTE_LIVENESS_STALE_AFTER"); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			return time.Duration(n) * time.Second
		}
	}
	return DefaultRemoteStaleAfter
}

// StartHealthSweep periodically checks all "online" workspaces. For
// container-backed runtimes (langgraph, claude-code, …) it calls the
// Docker API via `checker.IsRunning`. For `runtime='external'` (remote
// agents per Phase 30) it checks heartbeat freshness: a heartbeat older
// than `REMOTE_LIVENESS_STALE_AFTER` (default 90s) marks the workspace
// offline and calls `onOffline`.
//
// If `checker` is nil we still run the remote-liveness path — a
// deployment without Docker (e.g. a pure SaaS front-door) is a valid
// configuration and shouldn't lose liveness monitoring for its remote
// agents.
func StartHealthSweep(ctx context.Context, checker ContainerChecker, interval time.Duration, onOffline OfflineHandler) {
	if checker == nil {
		log.Println("Health sweep: no Docker container checker — running remote-liveness sweep only")
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	log.Printf("Health sweep: started (interval=%s, remote stale-after=%s)", interval, remoteStaleAfter())

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if checker != nil {
				sweepOnlineWorkspaces(ctx, checker, onOffline)
			}
			sweepStaleRemoteWorkspaces(ctx, onOffline)
		}
	}
}

func sweepOnlineWorkspaces(ctx context.Context, checker ContainerChecker, onOffline OfflineHandler) {
	// Skip external workspaces (runtime='external') — they have no Docker container
	rows, err := db.DB.QueryContext(ctx,
		`SELECT id FROM workspaces WHERE status IN ('online', 'degraded') AND COALESCE(runtime, 'langgraph') != 'external'`)
	if err != nil {
		log.Printf("Health sweep: query error: %v", err)
		return
	}
	defer rows.Close()

	var ids []string
	for rows.Next() {
		var id string
		if err := rows.Scan(&id); err == nil {
			ids = append(ids, id)
		}
	}

	for _, id := range ids {
		running, err := checker.IsRunning(ctx, id)
		if err != nil {
			continue // Docker API error — skip, don't false-positive
		}
		if running {
			continue
		}

		log.Printf("Health sweep: container for %s is gone — marking offline", id)

		_, err = db.DB.ExecContext(ctx,
			`UPDATE workspaces SET status = 'offline', updated_at = now()
			 WHERE id = $1 AND status NOT IN ('removed', 'provisioning')`, id)
		if err != nil {
			log.Printf("Health sweep: failed to mark %s offline: %v", id, err)
			continue
		}

		db.ClearWorkspaceKeys(ctx, id)

		if onOffline != nil {
			onOffline(ctx, id)
		}
	}
}

// sweepStaleRemoteWorkspaces marks `runtime='external'` workspaces offline
// when their last heartbeat is older than `remoteStaleAfter()`. This is
// the Phase 30.7 analogue of `sweepOnlineWorkspaces` — instead of asking
// Docker "is the container alive?" we ask the DB "did the agent check in
// recently?". Workspaces that never heartbeated (last_heartbeat_at IS
// NULL) are eligible for sweep only after they've been online longer
// than the staleness window, so a newly-registered agent gets a full
// grace period to send its first heartbeat.
func sweepStaleRemoteWorkspaces(ctx context.Context, onOffline OfflineHandler) {
	staleAfter := remoteStaleAfter()
	staleAfterSec := int(staleAfter / time.Second)

	// Use Postgres age arithmetic so the cutoff is computed server-side
	// (no clock skew between platform host and DB). `COALESCE` ensures
	// a NULL heartbeat is compared against updated_at (which is set
	// when the external workspace was created + marked online) — that
	// way an agent that registered but immediately crashed before its
	// first heartbeat still gets swept after the grace window.
	rows, err := db.DB.QueryContext(ctx, `
		SELECT id FROM workspaces
		WHERE status IN ('online', 'degraded')
		  AND COALESCE(runtime, 'langgraph') = 'external'
		  AND COALESCE(last_heartbeat_at, updated_at) < now() - ($1 || ' seconds')::interval
	`, staleAfterSec)
	if err != nil {
		log.Printf("Health sweep (remote): query error: %v", err)
		return
	}
	defer rows.Close()

	var ids []string
	for rows.Next() {
		var id string
		if err := rows.Scan(&id); err == nil {
			ids = append(ids, id)
		}
	}

	for _, id := range ids {
		log.Printf("Health sweep (remote): %s heartbeat stale (>%s) — marking offline", id, staleAfter)

		_, err = db.DB.ExecContext(ctx,
			`UPDATE workspaces SET status = 'offline', updated_at = now()
			 WHERE id = $1 AND status NOT IN ('removed', 'provisioning', 'paused')`, id)
		if err != nil {
			log.Printf("Health sweep (remote): failed to mark %s offline: %v", id, err)
			continue
		}

		db.ClearWorkspaceKeys(ctx, id)

		if onOffline != nil {
			onOffline(ctx, id)
		}
	}
}