molecule-core/workspace-server/cmd/server/main.go
Hongming Wang c0a5d842b4 feat(runtime): native_scheduler skip — primitive #3 of 6
When an adapter declares provides_native_scheduler=True (because its
SDK has built-in cron / Temporal-style workflows), the platform's
polling loop must skip firing schedules for that workspace — otherwise
the schedule fires twice (once natively, once via platform). The
native skip preserves observability (next_run_at still advances, the
schedule row stays in the DB, last_run_at would still update) while
moving the FIRE responsibility to the SDK.

Stacked on PR #2139 (idle_timeout_override end-to-end). The
RuntimeMetadata heartbeat block already carries the capability map;
this PR teaches the platform how to read and act on the scheduler bit.

Components:

  - handlers/runtime_overrides.go: extended the cache to store
    capability flags alongside idle timeout. Two heartbeat fields are
    independent — SetIdleTimeout / SetCapabilities each update one
    without stomping the other. Defensive copy on SetCapabilities so
    a caller mutating its map after the call doesn't retroactively
    change cached declarations. Empty entries dropped to avoid stale
    husks. A minimal cache sketch follows the component list.

  - handlers/runtime_overrides.go: new HasCapability(workspaceID, name)
    + ProvidesNativeScheduler(workspaceID) — the latter is the
    package-level adapter the scheduler imports (avoids a
    handlers/scheduler import cycle).

  - handlers/registry.go: heartbeat handler now calls SetCapabilities
    in addition to SetIdleTimeout.

  - scheduler/scheduler.go: NativeSchedulerCheck function-pointer DI
    (mirrors the existing QueueDrainFunc pattern). New() leaves the
    field nil so existing callers preserve today's "always fire"
    behavior. SetNativeSchedulerCheck wires production. tick() drops
    workspaces declaring native ownership before goroutine fan-out;
    advances next_run_at so we don't tight-loop on the same row. A
    scheduler-side sketch also follows the component list.

  - cmd/server/main.go: wires handlers.ProvidesNativeScheduler into
    the cron scheduler at server boot.
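
A minimal sketch of the capability-cache side, with illustrative names
(the real runtime_overrides.go types, fields, and locking differ; the
key string is assumed to match the Python flag name):

  package handlers

  import "sync"

  // Sketch only: a map keyed by workspace ID. The real cache also
  // carries the idle-timeout override alongside the capability flags.
  var (
      capMu   sync.RWMutex
      capByWS = map[string]map[string]bool{}
  )

  // SetCapabilities stores a defensive copy of the adapter's declared
  // capabilities; a nil or empty map clears the entry entirely so a
  // workspace that stops declaring capabilities leaves no stale husk.
  func SetCapabilities(workspaceID string, caps map[string]bool) {
      capMu.Lock()
      defer capMu.Unlock()
      if len(caps) == 0 {
          delete(capByWS, workspaceID)
          return
      }
      cp := make(map[string]bool, len(caps))
      for k, v := range caps {
          cp[k] = v
      }
      capByWS[workspaceID] = cp
  }

  // HasCapability reports whether the workspace's last heartbeat
  // declared the named capability.
  func HasCapability(workspaceID, name string) bool {
      capMu.RLock()
      defer capMu.RUnlock()
      return capByWS[workspaceID][name]
  }

  // ProvidesNativeScheduler is the package-level adapter the scheduler
  // imports, keeping handlers -> scheduler a one-way dependency.
  func ProvidesNativeScheduler(workspaceID string) bool {
      return HasCapability(workspaceID, "provides_native_scheduler")
  }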

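And a compressed sketch of the scheduler side; Schedule, advanceNextRun,
and fire are stand-ins for the real scheduler internals:

  package scheduler

  // Schedule is a trimmed-down stand-in for the real schedule row.
  type Schedule struct {
      ID          string
      WorkspaceID string
  }

  // Scheduler holds the optional native-scheduler check. nil preserves
  // today's "always fire" behavior for callers of New() that never call
  // SetNativeSchedulerCheck.
  type Scheduler struct {
      nativeSchedulerCheck func(workspaceID string) bool
  }

  func (s *Scheduler) SetNativeSchedulerCheck(check func(string) bool) {
      s.nativeSchedulerCheck = check
  }

  // tick drops natively-scheduled workspaces before goroutine fan-out,
  // but still advances next_run_at so the same row isn't re-selected
  // on the next poll.
  func (s *Scheduler) tick(due []Schedule, advanceNextRun, fire func(Schedule)) {
      for _, sched := range due {
          if s.nativeSchedulerCheck != nil && s.nativeSchedulerCheck(sched.WorkspaceID) {
              advanceNextRun(sched)
              continue
          }
          go fire(sched)
      }
  }
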
Tests:
  Go (7 new):
    - SetCapabilitiesAndHas (round-trip)
    - per-workspace isolation (ws-a's declaration doesn't leak to ws-b)
    - nil/empty map clears (adapter dropping the flag restores fallback)
    - SetCapabilities is a defensive copy (caller mutation can't
      retroactively flip cached value; see the sketch after this list)
    - SetIdleTimeout preserves capabilities and vice-versa (two-field
      independence)
    - empty entry deleted (no stale husks)
    - ProvidesNativeScheduler reads the same singleton heartbeat writes
    - SetNativeSchedulerCheck wires the function (scheduler-side)
    - nil-check safety contract for tick
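
  Roughly, the defensive-copy case (written against the hypothetical
  cache API sketched above, not the actual test file):

    package handlers

    import "testing"

    func TestSetCapabilitiesDefensiveCopy(t *testing.T) {
        caps := map[string]bool{"provides_native_scheduler": true}
        SetCapabilities("ws-a", caps)

        // Mutating the caller's map after the call must not flip the
        // cached declaration.
        caps["provides_native_scheduler"] = false

        if !ProvidesNativeScheduler("ws-a") {
            t.Fatal("cached capability changed after caller mutated its map")
        }
    }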

  Python: no change needed — the heartbeat already serializes the
  full capability map via _runtime_metadata_payload (PR #2139). An
  adapter setting RuntimeCapabilities(provides_native_scheduler=True)
  automatically flows through.

Verification:
  - 1308 / 1308 Python pytest pass (unchanged)
  - All Go handlers + scheduler tests pass
  - go build + go vet clean

See project memory `project_runtime_native_pluggable.md`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 22:47:00 -07:00


package main

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"syscall"
	"time"

	"github.com/Molecule-AI/molecule-monorepo/platform/internal/channels"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/imagewatch"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/router"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/scheduler"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"

	// External plugins — each registers EnvMutator(s) that run at workspace
	// provision time. Loaded via soft-dep gates in main() so self-hosters
	// without the App or without per-agent identity configured keep working.
	githubappauth "github.com/Molecule-AI/molecule-ai-plugin-github-app-auth/pluginloader"
	ghidentity "github.com/Molecule-AI/molecule-ai-plugin-gh-identity/pluginloader"

	"github.com/Molecule-AI/molecule-monorepo/platform/pkg/provisionhook"
)

func main() {
	// .env auto-load: in dev, the operator keeps MOLECULE_ENV /
	// DATABASE_URL / etc. in the monorepo's .env file. Loading it here
	// — before any code reads env — means a fresh `/tmp/molecule-server`
	// run picks up dev config without `set -a && source .env`. No-op
	// in production (Docker image doesn't ship a .env, and existing env
	// always wins over file values, so container env stays dominant).
	loadDotEnvIfPresent()

	// CP self-refresh: pull any operator-rotated config (e.g. a new
	// MOLECULE_CP_SHARED_SECRET) before any other code reads env.
	// Best-effort — if the CP is unreachable we keep booting with the
	// env we were provisioned with. Older SaaS tenants predate PR #53
	// and can arrive here with MOLECULE_CP_SHARED_SECRET unset; this
	// is how they heal without SSH.
	if err := refreshEnvFromCP(); err != nil {
		log.Printf("CP env refresh: %v (continuing with baked-in env)", err)
	}

	// Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start
	// without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5).
	// In any other environment, missing keys just log a warning and
	// continue with encryption disabled for dev ergonomics.
	if err := crypto.InitStrict(); err != nil {
		log.Fatalf("Secrets encryption: %v", err)
	}
	if crypto.IsEnabled() {
		log.Println("Secrets encryption: AES-256-GCM enabled")
	} else {
		log.Println("Secrets encryption: disabled (set SECRETS_ENCRYPTION_KEY — required when MOLECULE_ENV=prod)")
	}

	// Database
	databaseURL := envOr("DATABASE_URL", "postgres://dev:dev@localhost:5432/molecule?sslmode=disable")
	if err := db.InitPostgres(databaseURL); err != nil {
		log.Fatalf("Postgres init failed: %v", err)
	}

	// Run migrations
	migrationsDir := findMigrationsDir()
	if migrationsDir != "" {
		if err := db.RunMigrations(migrationsDir); err != nil {
			log.Fatalf("Migrations failed: %v", err)
		}
	}

	// Redis
	redisURL := envOr("REDIS_URL", "redis://localhost:6379")
	if err := db.InitRedis(redisURL); err != nil {
		log.Fatalf("Redis init failed: %v", err)
	}

	// WebSocket Hub — inject CanCommunicate as a function to avoid import cycles
	hub := ws.NewHub(registry.CanCommunicate)
	go hub.Run()

	// Event Broadcaster
	broadcaster := events.NewBroadcaster(hub)

	// Start Redis pub/sub subscriber
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Every long-running subsystem below is wrapped by supervised.RunWithRecover:
	// a panic (e.g. from a single bad tenant row) is logged + the subsystem is
	// restarted with exponential backoff instead of silently dying forever.
	// Motivation: issue #85 (scheduler silent outage for 12+ hours) and #92
	// (systemic — affects every background goroutine).
	go supervised.RunWithRecover(ctx, "broadcaster", broadcaster.Subscribe)

	// Activity log retention — configurable via env vars
	retentionDays := envOr("ACTIVITY_RETENTION_DAYS", "7")
	cleanupHours := envOr("ACTIVITY_CLEANUP_INTERVAL_HOURS", "6")
	cleanupInterval, _ := time.ParseDuration(cleanupHours + "h")
	if cleanupInterval == 0 {
		cleanupInterval = 6 * time.Hour
	}
	go func() {
		ticker := time.NewTicker(cleanupInterval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				result, err := db.DB.ExecContext(ctx, `DELETE FROM activity_logs WHERE created_at < now() - ($1 || ' days')::interval`, retentionDays)
				if err != nil {
					log.Printf("Activity log cleanup error: %v", err)
				} else if n, _ := result.RowsAffected(); n > 0 {
					log.Printf("Activity log cleanup: purged %d old entries", n)
				}
			}
		}
	}()

	// Provisioner — auto-detect backend:
	// 1. MOLECULE_ORG_ID set → SaaS tenant → control plane provisioner
	// 2. Docker available → self-hosted → Docker provisioner
	// 3. Neither → provisioner disabled (external agents only)
	var prov *provisioner.Provisioner
	var cpProv *provisioner.CPProvisioner
	if os.Getenv("MOLECULE_ORG_ID") != "" {
		// SaaS tenant — provision via control plane (holds Fly token, manages billing)
		if cp, err := provisioner.NewCPProvisioner(); err != nil {
			log.Printf("Control plane provisioner unavailable: %v", err)
		} else {
			cpProv = cp
			defer cpProv.Close()
			log.Println("Provisioner: Control Plane (auto-detected SaaS tenant)")
		}
	} else {
		// Self-hosted — use local Docker daemon
		if p, err := provisioner.New(); err != nil {
			log.Printf("Provisioner disabled (Docker not available): %v", err)
		} else {
			prov = p
			defer prov.Close()
			log.Println("Provisioner: Docker")
		}
	}
	port := envOr("PORT", "8080")
	platformURL := envOr("PLATFORM_URL", fmt.Sprintf("http://host.docker.internal:%s", port))
	configsDir := envOr("CONFIGS_DIR", findConfigsDir())

	// Init order: wh → onWorkspaceOffline → liveness/healthSweep → router
	// WorkspaceHandler is created before the router so RestartByID can be wired into
	// the offline callbacks used by both the liveness monitor and the health sweep.
	wh := handlers.NewWorkspaceHandler(broadcaster, prov, platformURL, configsDir)
	if cpProv != nil {
		wh.SetCPProvisioner(cpProv)
	}

	// External-plugin env mutators — each plugin contributes 0+ mutators
	// onto a shared registry. Order matters: gh-identity populates
	// MOLECULE_AGENT_ROLE-derived attribution env vars that downstream
	// mutators and the workspace's install.sh can then read. Keep
	// github-app-auth last because it fails loudly on misconfig and its
	// failure mode is "no GITHUB_TOKEN" — worth surfacing after the
	// cheaper mutators already ran.
	envReg := provisionhook.NewRegistry()

	// gh-identity plugin — per-agent attribution via env injection + gh
	// wrapper shipped as base64 env. Soft-dep: no config file is OK
	// (plugin no-ops when no role is set on the workspace).
	// Tracks molecule-core#1957.
	if res, err := ghidentity.BuildRegistry(); err != nil {
		log.Fatalf("gh-identity plugin: %v", err)
	} else {
		envReg.Register(res.Mutator)
		log.Printf("gh-identity: registered (config file=%q)", os.Getenv("MOLECULE_GH_IDENTITY_CONFIG_FILE"))
	}

	// github-app-auth plugin — injects GITHUB_TOKEN + GH_TOKEN into every
	// workspace env using the App's installation access token (rotates ~hourly).
	// Soft-skip when GITHUB_APP_* env vars are absent so dev/self-hosters
	// without an App configured keep working; fail-loud only on MISCONFIG
	// (e.g. APP_ID set but key file missing), not on unset.
	if os.Getenv("GITHUB_APP_ID") != "" {
		if reg, err := githubappauth.BuildRegistry(); err != nil {
			log.Fatalf("github-app-auth plugin: %v", err)
		} else {
			// Copy the plugin's mutators onto the shared registry so the
			// TokenProvider probe (FirstTokenProvider) still finds them.
			for _, m := range reg.Mutators() {
				envReg.Register(m)
			}
			log.Printf("github-app-auth: registered, %d mutator(s) added to chain", reg.Len())
		}
	} else {
		log.Println("github-app-auth: GITHUB_APP_ID unset — skipping plugin registration (agents will use any PAT from .env)")
	}
	wh.SetEnvMutators(envReg)
	log.Printf("env-mutator chain: %v", envReg.Names())

	// Offline handler: broadcast event + auto-restart the dead workspace
	onWorkspaceOffline := func(innerCtx context.Context, workspaceID string) {
		if err := broadcaster.RecordAndBroadcast(innerCtx, "WORKSPACE_OFFLINE", workspaceID, map[string]interface{}{}); err != nil {
			log.Printf("Offline broadcast error for %s: %v", workspaceID, err)
		}
		// Auto-restart: bring the workspace back automatically
		go wh.RestartByID(workspaceID)
	}

	// Start Liveness Monitor — Redis TTL expiry-based offline detection + auto-restart
	go supervised.RunWithRecover(ctx, "liveness-monitor", func(c context.Context) {
		registry.StartLivenessMonitor(c, onWorkspaceOffline)
	})

	// Proactive container health sweep — detects dead containers faster than Redis TTL.
	// Checks all "online" workspaces against Docker every 15 seconds.
	if prov != nil {
		go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
			registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
		})
	}

	// Orphan-container reconcile sweep — finds running containers
	// whose workspace row is already status='removed' and stops
	// them. Defence in depth on top of the inline cleanup in
	// handlers/workspace_crud.go: any Docker hiccup that left a
	// container alive after the user clicked delete heals on the
	// next sweep instead of leaking forever.
	if prov != nil {
		go supervised.RunWithRecover(ctx, "orphan-sweeper", func(c context.Context) {
			registry.StartOrphanSweeper(c, prov)
		})
	}

	// Provision-timeout sweep — flips workspaces that have been stuck in
	// status='provisioning' past the timeout window to 'failed' and emits
	// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
	// and the state is incoherent (e.g. user sees "Retry" after 15min but
	// backend still thinks provisioning is in progress).
	go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
		registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval)
	})
	// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
	cronSched := scheduler.New(wh, broadcaster)

	// Wire the native-scheduler skip — when an adapter's heartbeat
	// declares provides_native_scheduler=true, the platform's polling
	// loop drops that workspace's schedules to avoid double-fire (the
	// SDK runs them itself). See project memory
	// `project_runtime_native_pluggable.md` and capability primitive #3.
	cronSched.SetNativeSchedulerCheck(handlers.ProvidesNativeScheduler)
	go supervised.RunWithRecover(ctx, "scheduler", cronSched.Start)

	// Hibernation Monitor — auto-pauses idle workspaces that have
	// hibernation_idle_minutes configured (#711). Wakeup is triggered
	// automatically on the next incoming A2A message.
	go supervised.RunWithRecover(ctx, "hibernation-monitor", func(c context.Context) {
		registry.StartHibernationMonitor(c, wh.HibernateWorkspace)
	})

	// Channel Manager — social channel integrations (Telegram, Slack, etc.)
	channelMgr := channels.NewManager(wh, broadcaster)
	go supervised.RunWithRecover(ctx, "channel-manager", channelMgr.Start)

	// Image auto-refresh — closes the runtime CD chain to "merge → containers
	// running new code" with no human in between. Polls GHCR for digest
	// changes on workspace-template-* :latest tags and invokes the same
	// refresh logic /admin/workspace-images/refresh exposes. Opt-in:
	// SaaS deploys whose pipeline already pulls every release should leave
	// it off (would be redundant work). Self-hosters get true zero-touch.
	if prov != nil && strings.EqualFold(os.Getenv("IMAGE_AUTO_REFRESH"), "true") {
		svc := handlers.NewWorkspaceImageService(prov.DockerClient())
		watcher := imagewatch.New(svc)
		go supervised.RunWithRecover(ctx, "image-auto-refresh", watcher.Run)
	}

	// Wire channel manager into scheduler for auto-posting cron output to Slack
	cronSched.SetChannels(channelMgr)

	// Router
	r := router.Setup(hub, broadcaster, prov, platformURL, configsDir, wh, channelMgr)

	// HTTP server with graceful shutdown
	srv := &http.Server{
		Addr:    fmt.Sprintf(":%s", port),
		Handler: r,
	}

	// Start server in goroutine
	go func() {
		log.Printf("Platform starting on :%s", port)
		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			log.Fatalf("Server failed: %v", err)
		}
	}()

	// Wait for interrupt signal
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
	<-quit

	log.Println("Shutting down gracefully...")

	// Cancel background goroutines (liveness monitor, Redis subscriber)
	cancel()

	// Drain HTTP connections (30s timeout)
	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer shutdownCancel()
	if err := srv.Shutdown(shutdownCtx); err != nil {
		log.Printf("Server forced shutdown: %v", err)
	}

	// Close WebSocket hub
	hub.Close()

	log.Println("Platform stopped")
}

func envOr(key, fallback string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return fallback
}

// findConfigsDir locates the workspace config templates directory by probing
// a few relative candidates and keeping the first one that actually contains
// at least one template with a config.yaml.
func findConfigsDir() string {
	candidates := []string{
		"workspace-configs-templates",
		"../workspace-configs-templates",
		"../../workspace-configs-templates",
	}
	for _, c := range candidates {
		if info, err := os.Stat(c); err == nil && info.IsDir() {
			// Verify the directory has at least one template with a config.yaml
			entries, _ := os.ReadDir(c)
			hasTemplate := false
			for _, e := range entries {
				if e.IsDir() {
					if _, err := os.Stat(filepath.Join(c, e.Name(), "config.yaml")); err == nil {
						hasTemplate = true
						break
					}
				}
			}
			if !hasTemplate {
				continue
			}
			abs, _ := filepath.Abs(c)
			return abs
		}
	}
	return "workspace-configs-templates"
}

// findMigrationsDir locates the SQL migrations directory, checking
// working-directory-relative candidates first and then paths relative to the
// server binary; it returns "" (and logs) when none exists so boot can proceed.
func findMigrationsDir() string {
	candidates := []string{
		"migrations",
		"platform/migrations",
		"../migrations",
		"../../migrations",
	}
	if exe, err := os.Executable(); err == nil {
		dir := filepath.Dir(exe)
		candidates = append(candidates,
			filepath.Join(dir, "migrations"),
			filepath.Join(dir, "..", "migrations"),
			filepath.Join(dir, "..", "..", "migrations"),
		)
	}
	for _, c := range candidates {
		if info, err := os.Stat(c); err == nil && info.IsDir() {
			abs, _ := filepath.Abs(c)
			log.Printf("Found migrations at: %s", abs)
			return abs
		}
	}
	log.Println("No migrations directory found")
	return ""
}