forked from molecule-ai/molecule-core
Implements automatic workspace hibernation for workspaces that have been idle longer than their configured hibernation_idle_minutes threshold. Changes: - migrations/029: Add hibernation_idle_minutes INT DEFAULT NULL column + partial index on workspaces table - registry/hibernation.go: New StartHibernationMonitor goroutine that ticks every 2 min and calls hibernateIdleWorkspaces via the HibernateHandler callback (same import-cycle-prevention pattern as OfflineHandler) - registry/hibernation_test.go: 5 unit tests covering handler calls, no-rows, DB error, tick behaviour, and context-cancel shutdown - handlers/workspace_restart.go: New Hibernate() HTTP handler (POST /workspaces/:id/hibernate) + HibernateWorkspace(ctx, id) method — stops container, sets status='hibernated', clears Redis keys, broadcasts event - handlers/a2a_proxy.go: Auto-wake in resolveAgentURL — when status='hibernated' and URL is empty, triggers async RestartByID and returns 503 + Retry-After: 15 so callers can retry transparently - registry/liveness.go: Exclude 'hibernated' workspaces from offline detection - router.go: Register POST /workspaces/:id/hibernate under wsAuth group - cmd/server/main.go: Wire hibernation monitor via supervised.RunWithRecover Closes #711 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
103 lines
3.6 KiB
Go
103 lines
3.6 KiB
Go
package registry
|
|
|
|
import (
|
|
"context"
|
|
"log"
|
|
"time"
|
|
|
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
|
|
)
|
|
|
|
// HibernateHandler is called for each workspace that the hibernation monitor
|
|
// decides should be hibernated. The handler stops the container, updates the
|
|
// DB status, and broadcasts the event.
|
|
type HibernateHandler func(ctx context.Context, workspaceID string)
|
|
|
|
// defaultHibernationInterval is how often the hibernation monitor polls the
|
|
// database for idle-too-long workspaces. Two minutes is fine-grained enough
|
|
// for typical idle_hibernate_minutes values (≥5) and cheap enough on a busy
|
|
// platform — the query hits a partial index and does a small range scan.
|
|
const defaultHibernationInterval = 2 * time.Minute
|
|
|
|
// StartHibernationMonitor periodically scans for workspaces that have been
|
|
// idle (active_tasks == 0) longer than their configured hibernation_idle_minutes
|
|
// and calls onHibernate for each. It runs under supervised.RunWithRecover so a
|
|
// panic is recovered with exponential backoff rather than silently dying.
|
|
//
|
|
// Only workspaces with:
|
|
// - status IN ('online', 'degraded')
|
|
// - active_tasks == 0
|
|
// - hibernation_idle_minutes IS NOT NULL AND > 0
|
|
// - runtime != 'external' (external agents have no Docker container)
|
|
// - last heartbeat older than hibernation_idle_minutes minutes ago
|
|
//
|
|
// are candidates. The last_heartbeat_at column tracks the most recent
|
|
// successful heartbeat from the agent; when it is NULL the workspace has
|
|
// never heartbeated and is not yet eligible for hibernation (we give it a
|
|
// full grace period equal to hibernation_idle_minutes from its created_at).
|
|
func StartHibernationMonitor(ctx context.Context, onHibernate HibernateHandler) {
|
|
StartHibernationMonitorWithInterval(ctx, defaultHibernationInterval, onHibernate)
|
|
}
|
|
|
|
// StartHibernationMonitorWithInterval is StartHibernationMonitor with a
|
|
// configurable tick interval — exposed for tests so they don't have to wait
|
|
// 2 minutes for a tick.
|
|
func StartHibernationMonitorWithInterval(ctx context.Context, interval time.Duration, onHibernate HibernateHandler) {
|
|
ticker := time.NewTicker(interval)
|
|
defer ticker.Stop()
|
|
|
|
log.Printf("Hibernation monitor: started (interval=%s)", interval)
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
log.Println("Hibernation monitor: context done; stopping")
|
|
return
|
|
case <-ticker.C:
|
|
hibernateIdleWorkspaces(ctx, onHibernate)
|
|
supervised.Heartbeat("hibernation-monitor")
|
|
}
|
|
}
|
|
}
|
|
|
|
// hibernateIdleWorkspaces queries for hibernation candidates and calls
|
|
// onHibernate for each. Errors from DB are logged but do not crash the loop.
|
|
func hibernateIdleWorkspaces(ctx context.Context, onHibernate HibernateHandler) {
|
|
rows, err := db.DB.QueryContext(ctx, `
|
|
SELECT id
|
|
FROM workspaces
|
|
WHERE hibernation_idle_minutes IS NOT NULL
|
|
AND hibernation_idle_minutes > 0
|
|
AND status IN ('online', 'degraded')
|
|
AND active_tasks = 0
|
|
AND COALESCE(runtime, 'langgraph') != 'external'
|
|
AND last_heartbeat_at IS NOT NULL
|
|
AND last_heartbeat_at < now() - (hibernation_idle_minutes * INTERVAL '1 minute')
|
|
`)
|
|
if err != nil {
|
|
log.Printf("Hibernation monitor: query error: %v", err)
|
|
return
|
|
}
|
|
defer rows.Close()
|
|
|
|
var ids []string
|
|
for rows.Next() {
|
|
var id string
|
|
if rows.Scan(&id) == nil {
|
|
ids = append(ids, id)
|
|
}
|
|
}
|
|
if err := rows.Err(); err != nil {
|
|
log.Printf("Hibernation monitor: row iteration error: %v", err)
|
|
return
|
|
}
|
|
|
|
for _, id := range ids {
|
|
log.Printf("Hibernation monitor: hibernating idle workspace %s", id)
|
|
if onHibernate != nil {
|
|
onHibernate(ctx, id)
|
|
}
|
|
}
|
|
}
|