molecule-core/workspace-server/cmd/server/main.go

package main

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"syscall"
	"time"

	"github.com/Molecule-AI/molecule-monorepo/platform/internal/channels"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/imagewatch"
	memwiring "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/wiring"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/middleware"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/router"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/scheduler"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
	"github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"

	// External plugins — each registers EnvMutator(s) that run at workspace
	// provision time. Loaded via soft-dep gates in main() so self-hosters
	// without per-agent identity configured keep working.
	ghidentity "github.com/Molecule-AI/molecule-ai-plugin-gh-identity/pluginloader"

	"github.com/Molecule-AI/molecule-monorepo/platform/pkg/provisionhook"
)

// main boots the workspace server. In order it: loads environment config,
// initialises secrets encryption, Postgres (plus migrations) and Redis,
// starts the WebSocket hub and the background subsystems, builds the router,
// and serves HTTP until SIGINT/SIGTERM, after which it cancels the background
// context, drains HTTP connections and closes the hub.
//
// Any failure in a hard dependency (encryption key in prod, Postgres,
// migrations, Redis, gh-identity config, HTTP listener) terminates the
// process via log.Fatalf.
func main() {
	// .env auto-load: in dev, the operator keeps MOLECULE_ENV /
	// DATABASE_URL / etc. in the monorepo's .env file. Loading it here
	// — before any code reads env — means a fresh `/tmp/molecule-server`
	// run picks up dev config without `set -a && source .env`. No-op
	// in production (Docker image doesn't ship a .env, and existing env
	// always wins over file values, so container env stays dominant).
	loadDotEnvIfPresent()

	// CP self-refresh: pull any operator-rotated config (e.g. a new
	// MOLECULE_CP_SHARED_SECRET) before any other code reads env.
	// Best-effort — if the CP is unreachable we keep booting with the
	// env we were provisioned with. Older SaaS tenants predate PR #53
	// and can arrive here with MOLECULE_CP_SHARED_SECRET unset; this
	// is how they heal without SSH.
	if err := refreshEnvFromCP(); err != nil {
		log.Printf("CP env refresh: %v (continuing with baked-in env)", err)
	}

	// Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start
	// without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5).
	// In any other environment, missing keys just log a warning and
	// continue with encryption disabled for dev ergonomics.
	if err := crypto.InitStrict(); err != nil {
		log.Fatalf("Secrets encryption: %v", err)
	}
	if crypto.IsEnabled() {
		log.Println("Secrets encryption: AES-256-GCM enabled")
	} else {
		log.Println("Secrets encryption: disabled (set SECRETS_ENCRYPTION_KEY — required when MOLECULE_ENV=prod)")
	}

	// Database
	databaseURL := envOr("DATABASE_URL", "postgres://dev:dev@localhost:5432/molecule?sslmode=disable")
	if err := db.InitPostgres(databaseURL); err != nil {
		log.Fatalf("Postgres init failed: %v", err)
	}

	// Run migrations. An empty result from findMigrationsDir means no
	// directory was located, in which case migrations are skipped.
	migrationsDir := findMigrationsDir()
	if migrationsDir != "" {
		if err := db.RunMigrations(migrationsDir); err != nil {
			log.Fatalf("Migrations failed: %v", err)
		}
	}

	// Redis
	redisURL := envOr("REDIS_URL", "redis://localhost:6379")
	if err := db.InitRedis(redisURL); err != nil {
		log.Fatalf("Redis init failed: %v", err)
	}

	// WebSocket Hub — inject CanCommunicate as a function to avoid import cycles
	hub := ws.NewHub(registry.CanCommunicate)
	go hub.Run()

	// Event Broadcaster
	broadcaster := events.NewBroadcaster(hub)

	// Background context shared by every subsystem started below; cancelled
	// on shutdown (and by the deferred cancel on any normal return).
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Start Redis pub/sub subscriber.
	//
	// Every long-running subsystem below is wrapped by supervised.RunWithRecover:
	// a panic (e.g. from a single bad tenant row) is logged + the subsystem is
	// restarted with exponential backoff instead of silently dying forever.
	// Motivation: issue #85 (scheduler silent outage for 12+ hours) and #92
	// (systemic — affects every background goroutine).
	go supervised.RunWithRecover(ctx, "broadcaster", broadcaster.Subscribe)

	// Activity log retention — configurable via env vars.
	retentionDays := envOr("ACTIVITY_RETENTION_DAYS", "7")
	// A negative value would flip the sign of the interval in the DELETE
	// below (now() - '-N days' lies in the future), which would purge every
	// row in activity_logs. Reject it and fall back to the default.
	if strings.HasPrefix(strings.TrimSpace(retentionDays), "-") {
		log.Printf("ACTIVITY_RETENTION_DAYS=%q is negative; using default of 7 days", retentionDays)
		retentionDays = "7"
	}
	cleanupHours := envOr("ACTIVITY_CLEANUP_INTERVAL_HOURS", "6")
	cleanupInterval, parseErr := time.ParseDuration(cleanupHours + "h")
	if parseErr != nil || cleanupInterval <= 0 {
		// time.NewTicker panics on a non-positive interval, and the cleanup
		// goroutine below is not supervised, so an unparsable, zero or
		// negative value must never reach it.
		log.Printf("ACTIVITY_CLEANUP_INTERVAL_HOURS=%q is not a positive number of hours; using default of 6h", cleanupHours)
		cleanupInterval = 6 * time.Hour
	}
	// NOTE(review): unlike the subsystems below, this loop is a bare
	// goroutine rather than a supervised.RunWithRecover call — confirm
	// whether that is intentional.
	go func() {
		ticker := time.NewTicker(cleanupInterval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				result, err := db.DB.ExecContext(ctx, `DELETE FROM activity_logs WHERE created_at < now() - ($1 || ' days')::interval`, retentionDays)
				if err != nil {
					log.Printf("Activity log cleanup error: %v", err)
				} else if n, _ := result.RowsAffected(); n > 0 {
					log.Printf("Activity log cleanup: purged %d old entries", n)
				}
			}
		}
	}()

	// Provisioner — auto-detect backend:
	//   1. MOLECULE_ORG_ID set → SaaS tenant → control plane provisioner
	//   2. Docker available     → self-hosted → Docker provisioner
	//   3. Neither              → provisioner disabled (external agents only)
	var prov *provisioner.Provisioner
	var cpProv *provisioner.CPProvisioner
	if os.Getenv("MOLECULE_ORG_ID") != "" {
		// SaaS tenant — provision via control plane (holds Fly token, manages billing)
		if cp, err := provisioner.NewCPProvisioner(); err != nil {
			log.Printf("Control plane provisioner unavailable: %v", err)
		} else {
			cpProv = cp
			defer cpProv.Close()
			log.Println("Provisioner: Control Plane (auto-detected SaaS tenant)")
		}
	} else {
		// Self-hosted — use local Docker daemon
		if p, err := provisioner.New(); err != nil {
			log.Printf("Provisioner disabled (Docker not available): %v", err)
		} else {
			prov = p
			defer prov.Close()
			log.Println("Provisioner: Docker")
		}
	}

	port := envOr("PORT", "8080")
	platformURL := envOr("PLATFORM_URL", fmt.Sprintf("http://host.docker.internal:%s", port))
	configsDir := envOr("CONFIGS_DIR", findConfigsDir())

	// Init order: wh → onWorkspaceOffline → liveness/healthSweep → router
	// WorkspaceHandler is created before the router so RestartByID can be wired into
	// the offline callbacks used by both the liveness monitor and the health sweep.
	wh := handlers.NewWorkspaceHandler(broadcaster, prov, platformURL, configsDir)
	if cpProv != nil {
		wh.SetCPProvisioner(cpProv)
	}

	// Memory v2 plugin (RFC #2728): build the dependency bundle once
	// here so all three handlers (MCPHandler, AdminMemoriesHandler,
	// WorkspaceHandler) get the same plugin/resolver pair. memBundle
	// is nil when MEMORY_PLUGIN_URL is unset — every consumer
	// nil-checks before using.
	memBundle := memwiring.Build(db.DB)
	if memBundle != nil {
		wh.WithNamespaceCleanup(memBundle.NamespaceCleanupFn())
	}

	// External-plugin env mutators — each plugin contributes 0+ mutators
	// onto a shared registry. gh-identity populates MOLECULE_AGENT_ROLE-
	// derived attribution env vars that the workspace's install.sh can
	// then read.
	//
	// github-app-auth was dropped 2026-05-07 (closes #157): per-agent
	// Gitea identities (this gh-identity plugin's role-derived path)
	// replaced GitHub-App-installation tokens after the 2026-05-06
	// suspension. Workspaces now provision with a per-persona Gitea PAT
	// from .env instead of an App-rotated GITHUB_TOKEN.
	envReg := provisionhook.NewRegistry()

	// gh-identity plugin — per-agent attribution via env injection + gh
	// wrapper shipped as base64 env. Soft-dep: no config file is OK
	// (plugin no-ops when no role is set on the workspace).
	// Tracks molecule-core#1957.
	//
	// Any error BuildRegistry does return is treated as fatal.
	if res, err := ghidentity.BuildRegistry(); err != nil {
		log.Fatalf("gh-identity plugin: %v", err)
	} else {
		envReg.Register(res.Mutator)
		log.Printf("gh-identity: registered (config file=%q)", os.Getenv("MOLECULE_GH_IDENTITY_CONFIG_FILE"))
	}

	wh.SetEnvMutators(envReg)
	log.Printf("env-mutator chain: %v", envReg.Names())

	// Offline handler: broadcast event + auto-restart the dead workspace
	onWorkspaceOffline := func(innerCtx context.Context, workspaceID string) {
		if err := broadcaster.RecordAndBroadcast(innerCtx, "WORKSPACE_OFFLINE", workspaceID, map[string]interface{}{}); err != nil {
			log.Printf("Offline broadcast error for %s: %v", workspaceID, err)
		}
		// Auto-restart: bring the workspace back automatically. Runs in its
		// own goroutine so the caller of this callback is not blocked.
		go wh.RestartByID(workspaceID)
	}

	// Start Liveness Monitor — Redis TTL expiry-based offline detection + auto-restart
	go supervised.RunWithRecover(ctx, "liveness-monitor", func(c context.Context) {
		registry.StartLivenessMonitor(c, onWorkspaceOffline)
	})

	// Proactive health sweep — two passes per tick:
	//   1. Docker-side: checks "online" workspaces against the local Docker
	//      daemon (only runs when prov is non-nil, i.e. self-hosted mode).
	//   2. Remote-side: scans runtime='external' rows whose last_heartbeat_at
	//      is past REMOTE_LIVENESS_STALE_AFTER and flips them to
	//      awaiting_agent. Runs regardless of provisioner mode — SaaS
	//      tenants need this even though they don't run Docker locally,
	//      because external-runtime workspaces are operator-managed and
	//      the platform-side liveness sweep is the only thing that
	//      transitions them off 'online' when the operator's CLI dies.
	//
	// Pre-2026-04-30 this goroutine was gated on prov != nil, which silently
	// disabled the remote-side sweep on every SaaS tenant. The function in
	// healthsweep.go has always handled nil checker correctly; only the
	// orchestration was wrong. See #2392's CI failure for the trace.
	go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
		registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
	})

	// Orphan-container reconcile sweep — finds running containers
	// whose workspace row is already status='removed' and stops
	// them. Defence in depth on top of the inline cleanup in
	// handlers/workspace_crud.go: any Docker hiccup that left a
	// container alive after the user clicked delete heals on the
	// next sweep instead of leaking forever.
	if prov != nil {
		go supervised.RunWithRecover(ctx, "orphan-sweeper", func(c context.Context) {
			registry.StartOrphanSweeper(c, prov)
		})
	}

	// CP-mode orphan sweeper — SaaS counterpart to the Docker sweeper
	// above. Re-issues cpProv.Stop for any workspace at status='removed'
	// with a non-NULL instance_id, healing the deprovision split-write
	// race documented in #2989: tenant marks status='removed' BEFORE
	// calling CP DELETE, so a transient CP failure leaves the EC2
	// running with no retry path. cpProv.Stop is idempotent against
	// already-terminated instances; on success we clear instance_id.
	if cpProv != nil {
		go supervised.RunWithRecover(ctx, "cp-orphan-sweeper", func(c context.Context) {
			registry.StartCPOrphanSweeper(c, cpProv)
		})
	}

	// Pending-uploads GC sweep — deletes acked rows past their retention
	// window plus unacked rows past expires_at. Without this the
	// pending_uploads table grows unbounded; even with the 24h hard TTL,
	// nothing actually deletes a row, just makes it un-fetchable.
	go supervised.RunWithRecover(ctx, "pending-uploads-sweeper", func(c context.Context) {
		// NOTE(review): the trailing 0 presumably selects the sweeper's
		// default interval — confirm against pendinguploads.StartSweeper.
		pendinguploads.StartSweeper(c, pendinguploads.NewPostgres(db.DB), 0)
	})

	// Provision-timeout sweep — flips workspaces that have been stuck in
	// status='provisioning' past the timeout window to 'failed' and emits
	// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
	// and the state is incoherent (e.g. user sees "Retry" after 15min but
	// backend still thinks provisioning is in progress).
	go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
		// Pass the handler's per-runtime template-manifest lookup so the
		// sweeper honours `runtime_config.provision_timeout_seconds`
		// declared in any template's config.yaml — the same value the
		// canvas already reads via addProvisionTimeoutMs. Without this
		// the sweeper killed claude-code at the 10-min hardcoded floor
		// regardless of the manifest. See registry.RuntimeTimeoutLookup.
		registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval, wh.ProvisionTimeoutSecondsForRuntime)
	})

	// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
	cronSched := scheduler.New(wh, broadcaster)
	// Wire the native-scheduler skip — when an adapter's heartbeat
	// declares provides_native_scheduler=true, the platform's polling
	// loop drops that workspace's schedules to avoid double-fire (the
	// SDK runs them itself). See project memory
	// `project_runtime_native_pluggable.md` and capability primitive #3.
	cronSched.SetNativeSchedulerCheck(handlers.ProvidesNativeScheduler)
	go supervised.RunWithRecover(ctx, "scheduler", cronSched.Start)

	// Hibernation Monitor — auto-pauses idle workspaces that have
	// hibernation_idle_minutes configured (#711). Wakeup is triggered
	// automatically on the next incoming A2A message.
	go supervised.RunWithRecover(ctx, "hibernation-monitor", func(c context.Context) {
		registry.StartHibernationMonitor(c, wh.HibernateWorkspace)
	})

	// RFC #2829 PR-3: stuck-task sweeper for the durable delegations
	// ledger. Marks deadline-exceeded rows as failed and heartbeat-stale
	// in-flight rows as stuck. Both transitions go through the ledger's
	// terminal forward-only protection so concurrent UpdateStatus calls
	// are not clobbered. Defaults: 5min interval, 10min stale threshold;
	// override via DELEGATION_SWEEPER_INTERVAL_S / DELEGATION_STUCK_THRESHOLD_S.
	//
	// NOTE(review): both constructor arguments are nil here; presumably the
	// sweeper substitutes its own defaults — confirm in handlers.
	delegSweeper := handlers.NewDelegationSweeper(nil, nil)
	go supervised.RunWithRecover(ctx, "delegation-sweeper", delegSweeper.Start)

	// Channel Manager — social channel integrations (Telegram, Slack, etc.)
	channelMgr := channels.NewManager(wh, broadcaster)
	go supervised.RunWithRecover(ctx, "channel-manager", channelMgr.Start)

	// Image auto-refresh — closes the runtime CD chain to "merge → containers
	// running new code" with no human in between. Polls GHCR for digest
	// changes on workspace-template-* :latest tags and invokes the same
	// refresh logic /admin/workspace-images/refresh exposes. Opt-in:
	// SaaS deploys whose pipeline already pulls every release should leave
	// it off (would be redundant work). Self-hosters get true zero-touch.
	if prov != nil && strings.EqualFold(os.Getenv("IMAGE_AUTO_REFRESH"), "true") {
		svc := handlers.NewWorkspaceImageService(prov.DockerClient())
		watcher := imagewatch.New(svc)
		go supervised.RunWithRecover(ctx, "image-auto-refresh", watcher.Run)
	}

	// Wire channel manager into scheduler for auto-posting cron output to Slack
	cronSched.SetChannels(channelMgr)

	// Router
	r := router.Setup(hub, broadcaster, prov, platformURL, configsDir, wh, channelMgr, memBundle)

	// HTTP server with graceful shutdown.
	//
	// Bind host: in dev-mode (no ADMIN_TOKEN, MOLECULE_ENV=dev|development)
	// the AdminAuth chain fails open by design; pairing that with a wildcard
	// bind would expose unauth /workspaces to any same-LAN peer. Default to
	// loopback when fail-open is active. Operators who need LAN exposure set
	// BIND_ADDR=0.0.0.0 explicitly. Production (ADMIN_TOKEN set) is unchanged.
	// See molecule-core#7.
	//
	// NOTE(review): no ReadHeaderTimeout/IdleTimeout is configured, so a
	// client may hold a connection open indefinitely while sending headers.
	// Consider setting ReadHeaderTimeout once the value has been validated
	// against the load balancer in front of this server.
	bindHost := resolveBindHost()
	srv := &http.Server{
		Addr:    fmt.Sprintf("%s:%s", bindHost, port),
		Handler: r,
	}

	// Start server in goroutine. http.ErrServerClosed is the expected result
	// of srv.Shutdown below and is not treated as a failure.
	go func() {
		log.Printf("Platform starting on %s:%s (dev-mode-fail-open=%v)", bindHost, port, middleware.IsDevModeFailOpen())
		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			// NOTE(review): log.Fatalf exits immediately, so the deferred
			// provisioner Close calls in main do not run on this path.
			log.Fatalf("Server failed: %v", err)
		}
	}()

	// Wait for interrupt signal. The channel is buffered so a signal that
	// arrives before the receive below is not dropped.
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
	<-quit
	log.Println("Shutting down gracefully...")

	// Cancel the shared background context (activity-log cleanup and every
	// supervised subsystem started above).
	cancel()

	// Drain HTTP connections (30s timeout)
	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer shutdownCancel()
	if err := srv.Shutdown(shutdownCtx); err != nil {
		log.Printf("Server forced shutdown: %v", err)
	}

	// Close WebSocket hub
	hub.Close()

	log.Println("Platform stopped")
}

// envOr returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func envOr(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}

// resolveBindHost chooses the host part of the HTTP listen address.
//
// The first matching rule wins:
//  1. BIND_ADDR is non-empty → that value verbatim (explicit operator
//     override, e.g. "0.0.0.0").
//  2. middleware.IsDevModeFailOpen() reports true → "127.0.0.1", so the
//     server is reachable over loopback only.
//  3. Anything else → "", which leaves the host out of the listen address
//     so the server listens on every interface (the existing prod /
//     self-host shape).
//
// Tying the loopback default to the fail-open check keeps bind narrowness
// and auth strength moving together: when the auth chain is fail-open the
// listener is not exposed to the LAN unless the operator asks for it. See
// molecule-core#7 for the original LAN exposure finding.
func resolveBindHost() string {
	override := os.Getenv("BIND_ADDR")
	switch {
	case override != "":
		return override
	case middleware.IsDevModeFailOpen():
		return "127.0.0.1"
	default:
		return ""
	}
}

// findConfigsDir locates the workspace template directory.
//
// It probes "workspace-configs-templates" relative to the working directory
// and its two nearest ancestors, in that order. A candidate is accepted only
// if it is a directory containing at least one sub-directory that holds a
// config.yaml. The accepted candidate is returned as an absolute path; if
// the absolute path cannot be resolved the relative candidate is returned
// instead. When no candidate qualifies, the bare relative name
// "workspace-configs-templates" is returned.
func findConfigsDir() string {
	const fallback = "workspace-configs-templates"
	candidates := []string{
		"workspace-configs-templates",
		"../workspace-configs-templates",
		"../../workspace-configs-templates",
	}
	for _, candidate := range candidates {
		info, err := os.Stat(candidate)
		if err != nil || !info.IsDir() {
			continue
		}

		// Verify the directory has at least one template with a config.yaml.
		// A ReadDir error leaves entries empty, so the candidate is skipped.
		entries, _ := os.ReadDir(candidate)
		hasTemplate := false
		for _, entry := range entries {
			if !entry.IsDir() {
				continue
			}
			if _, err := os.Stat(filepath.Join(candidate, entry.Name(), "config.yaml")); err == nil {
				hasTemplate = true
				break
			}
		}
		if !hasTemplate {
			continue
		}

		abs, err := filepath.Abs(candidate)
		if err != nil {
			// Abs only fails when the working directory cannot be
			// determined; the relative path is still usable, whereas an
			// empty string is not.
			return candidate
		}
		return abs
	}
	return fallback
}

// findMigrationsDir locates the migrations directory and returns its
// absolute path, or "" when no candidate exists.
//
// Candidates are probed in order: first paths relative to the working
// directory, then — when os.Executable succeeds — paths relative to the
// directory of the running binary. The first candidate that exists and is a
// directory wins. If its absolute path cannot be resolved, the candidate is
// returned as-is rather than "", because callers treat "" as "no migrations
// directory" and would skip migrations entirely.
func findMigrationsDir() string {
	candidates := []string{
		"migrations",
		"platform/migrations",
		"../migrations",
		"../../migrations",
	}

	if exe, err := os.Executable(); err == nil {
		exeDir := filepath.Dir(exe)
		candidates = append(candidates,
			filepath.Join(exeDir, "migrations"),
			filepath.Join(exeDir, "..", "migrations"),
			filepath.Join(exeDir, "..", "..", "migrations"),
		)
	}

	for _, candidate := range candidates {
		info, err := os.Stat(candidate)
		if err != nil || !info.IsDir() {
			continue
		}
		abs, err := filepath.Abs(candidate)
		if err != nil {
			// The directory was found; only the working directory lookup
			// failed. Keep the usable relative path instead of reporting
			// "not found".
			log.Printf("Resolving absolute path for migrations dir %q: %v (using relative path)", candidate, err)
			abs = candidate
		}
		log.Printf("Found migrations at: %s", abs)
		return abs
	}
	log.Println("No migrations directory found")
	return ""
}