molecule-core/platform/internal/registry/liveness.go
molecule-ai[bot] 7f5f74d493
feat(registry): workspace hibernation — auto-pause idle workspaces (#711)
Implements automatic workspace hibernation for workspaces that have been idle
longer than their configured hibernation_idle_minutes threshold.

Changes:
- migrations/029: Add hibernation_idle_minutes INT DEFAULT NULL column +
  partial index on workspaces table
- registry/hibernation.go: New StartHibernationMonitor goroutine that ticks
  every 2 min and calls hibernateIdleWorkspaces via the HibernateHandler
  callback (same import-cycle-prevention pattern as OfflineHandler)
- registry/hibernation_test.go: 5 unit tests covering handler calls, no-rows,
  DB error, tick behaviour, and context-cancel shutdown
- handlers/workspace_restart.go: New Hibernate() HTTP handler (POST
  /workspaces/:id/hibernate) + HibernateWorkspace(ctx, id) method — stops
  container, sets status='hibernated', clears Redis keys, broadcasts event
- handlers/a2a_proxy.go: Auto-wake in resolveAgentURL — when status='hibernated'
  and URL is empty, triggers async RestartByID and returns 503 + Retry-After: 15
  so callers can retry transparently
- registry/liveness.go: Exclude 'hibernated' workspaces from offline detection
- router.go: Register POST /workspaces/:id/hibernate under wsAuth group
- cmd/server/main.go: Wire hibernation monitor via supervised.RunWithRecover

Closes #711

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 13:27:39 +00:00

60 lines
1.5 KiB
Go

package registry
import (
"context"
"log"
"strings"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
)
// OfflineHandler is called when a workspace's liveness key expires.
//
// StartLivenessMonitor invokes it with the monitor's own context and the
// workspace ID parsed from the expired Redis key, after the statement that
// marks the workspace offline has executed without error. A nil handler is
// allowed and is simply not called.
type OfflineHandler func(ctx context.Context, workspaceID string)
// StartLivenessMonitor subscribes to Redis keyspace expiry events and blocks
// until ctx is cancelled or the subscription channel is closed.
//
// For every expired key of the form ws:{id} it sets the workspace's status to
// 'offline' in Postgres (rows whose status is 'removed', 'paused' or
// 'hibernated' are left untouched) and then calls onOffline, if it is non-nil,
// with the workspace ID. Keys of any other shape, such as ws:{id}:suffix, are
// ignored. If the UPDATE fails, the error is logged and onOffline is not
// called for that key.
func StartLivenessMonitor(ctx context.Context, onOffline OfflineHandler) {
	sub := db.RDB.PSubscribe(ctx, "__keyevent@0__:expired")
	// Release the subscription on every exit path. The Close error is
	// deliberately ignored: the monitor is shutting down and has nothing
	// useful to do with it.
	defer sub.Close()

	log.Println("Liveness monitor started — listening for Redis key expirations")

	ch := sub.Channel()
	for {
		select {
		case <-ctx.Done():
			return

		case msg, ok := <-ch:
			if !ok {
				// A closed channel yields nil immediately and forever; without
				// this return the loop would spin at 100% CPU.
				log.Println("Liveness monitor stopped — Redis subscription channel closed")
				return
			}
			if msg == nil {
				continue
			}

			workspaceID, ok := livenessKeyWorkspaceID(msg.Payload)
			if !ok {
				continue
			}
			log.Printf("Liveness: workspace %s TTL expired", workspaceID)

			// Mark offline in Postgres — skip paused and hibernated workspaces (no active container)
			_, err := db.DB.ExecContext(ctx, `
				UPDATE workspaces SET status = 'offline', updated_at = now()
				WHERE id = $1 AND status NOT IN ('removed', 'paused', 'hibernated')
			`, workspaceID)
			if err != nil {
				log.Printf("Liveness: failed to mark %s offline: %v", workspaceID, err)
				continue
			}

			// NOTE(review): the handler runs whenever the UPDATE succeeds, even
			// if it matched no row (e.g. the workspace is hibernated). Confirm
			// that callers expect this before gating on RowsAffected.
			if onOffline != nil {
				onOffline(ctx, workspaceID)
			}
		}
	}
}

// livenessKeyWorkspaceID extracts {id} from a liveness key of the exact form
// ws:{id}. It reports false for keys without the "ws:" prefix, keys with
// further ':'-separated segments, and keys whose ID part is empty.
func livenessKeyWorkspaceID(key string) (string, bool) {
	if !strings.HasPrefix(key, "ws:") {
		return "", false
	}
	id := strings.TrimPrefix(key, "ws:")
	if id == "" || strings.Contains(id, ":") {
		return "", false
	}
	return id, true
}