Preflight was hard-failing the workspace boot when required env vars or legacy auth_token_files were missing, raising SystemExit(1) before main.py's PR #2756 try/except could mount the not-configured handler. Result: codex/openclaw workspaces launched without OPENAI_API_KEY were INVISIBLE — `/.well-known/agent-card.json` never returned 200, the bench timed out at 600s, canvas had no actionable signal. PR #2756 fixed half the puzzle (decouple agent-card from adapter.setup() failure); this fixes the other half (decouple from preflight failure). Caught by bench-provision-time run 25335853189 on 2026-05-04: codex and openclaw both timed_out at 609s while claude-code (whose default model needs no env) hit 86.7s on the same AMI. Hermes hit 147s because hermes config doesn't declare top-level required_env. After this change: - Missing required_env: WARN (operator sees it in boot logs); workspace proceeds to adapter.setup() which raises with the same env-name detail; PR #2756's try/except mounts the not-configured handler; /.well-known/agent-card.json serves 200; JSON-RPC POST / returns -32603 "agent not configured" with the env-name in `error.data`. - Missing auth_token_file (legacy path): same treatment. - Other preflight failures (runtime adapter not installable, invalid A2A port) STAY as fails — those are structural, the workspace truly can't run. Updated 4 existing tests that asserted `report.ok is False` on required_env / auth_token misses to assert `report.ok is True` and check `report.warnings` instead. All 31 preflight tests pass; full suite 1664 pass + 1 unrelated flake on staging. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
299 lines
13 KiB
Python
299 lines
13 KiB
Python
"""Startup preflight checks for workspace runtime configs."""
|
|
|
|
import importlib
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
from config import WorkspaceConfig
|
|
|
|
|
|
def _validate_runtime_via_adapter(runtime: str) -> tuple[bool, str]:
|
|
"""Discover the installed adapter and confirm it matches the
|
|
config's `runtime` field. Returns (ok, detail) — detail is the
|
|
operator-actionable failure message when ok is False.
|
|
|
|
Replaces the previous hardcoded SUPPORTED_RUNTIMES allowlist
|
|
(claude-code / codex / ollama / langgraph / etc.). The static list
|
|
couldn't keep up with new template repos: each new adapter required
|
|
a code change in molecule-runtime to be 'supported', a violation of
|
|
the universal-runtime principle (#87).
|
|
|
|
Discovery uses the same ADAPTER_MODULE env var that production load
|
|
paths consult (workspace/adapters/__init__.py:get_adapter). The
|
|
adapter's static name() string is the source of truth — config.yaml
|
|
just labels which one the operator expects, and the check warns on
|
|
drift.
|
|
|
|
Failure modes the function distinguishes (each gets a distinct
|
|
operator-facing message so debugging is concrete):
|
|
- ADAPTER_MODULE unset → "no adapter installed"
|
|
- ADAPTER_MODULE set but module won't import → "import failed: …"
|
|
- module imports but no Adapter class → "Adapter class missing"
|
|
- Adapter.name() differs from config.runtime → drift warning
|
|
"""
|
|
adapter_module = os.environ.get("ADAPTER_MODULE", "").strip()
|
|
if not adapter_module:
|
|
return False, (
|
|
"ADAPTER_MODULE env var is unset — no adapter installed in this "
|
|
f"image. Workspace declares runtime='{runtime}' but the runtime "
|
|
"discovery path can't find any. In a template image this is set "
|
|
"in the Dockerfile (ENV ADAPTER_MODULE=adapter); in dev, set it "
|
|
"to your local adapter module name."
|
|
)
|
|
try:
|
|
mod = importlib.import_module(adapter_module)
|
|
except Exception as exc:
|
|
return False, (
|
|
f"ADAPTER_MODULE={adapter_module!r} is not importable: "
|
|
f"{type(exc).__name__}: {exc}. Check the module path + that its "
|
|
"dependencies installed cleanly."
|
|
)
|
|
adapter_cls = getattr(mod, "Adapter", None)
|
|
if adapter_cls is None:
|
|
return False, (
|
|
f"ADAPTER_MODULE={adapter_module!r} imported, but no `Adapter` "
|
|
"class is exported. Add `Adapter = YourAdapterClass` at module "
|
|
"scope (convention from BaseAdapter docstring)."
|
|
)
|
|
try:
|
|
adapter_name = adapter_cls.name()
|
|
except Exception as exc:
|
|
return False, (
|
|
f"Adapter.name() raised {type(exc).__name__}: {exc}. The static "
|
|
"name() classmethod must return the runtime identifier without "
|
|
"side effects."
|
|
)
|
|
if not isinstance(adapter_name, str) or not adapter_name:
|
|
return False, "Adapter.name() must return a non-empty string."
|
|
if adapter_name != runtime:
|
|
# Drift between config.yaml and the installed adapter is unusual
|
|
# but not fatal — the adapter wins (it's what actually runs).
|
|
# Operator-facing detail names both so they can fix whichever is
|
|
# stale.
|
|
return True, (
|
|
f"Drift: config.yaml runtime={runtime!r} but installed Adapter "
|
|
f"reports name={adapter_name!r}. The adapter wins; update "
|
|
"config.yaml to match if the drift is unintended."
|
|
)
|
|
return True, ""
|
|
|
|
|
|
@dataclass
class PreflightIssue:
    """One preflight finding, rendered to the boot log by render_preflight_report."""

    severity: str  # "warn" or "fail" — mirrors which PreflightReport list it is appended to
    title: str  # short category label, e.g. "Runtime", "A2A port", "Required env", "Skill"
    detail: str  # operator-facing description of what is wrong
    fix: str = ""  # actionable remediation hint; empty when none applies
|
|
|
|
|
|
@dataclass
class PreflightReport:
    """Aggregated preflight results: warnings never block boot, failures do."""

    warnings: list[PreflightIssue] = field(default_factory=list)  # non-blocking findings
    failures: list[PreflightIssue] = field(default_factory=list)  # structural blockers

    @property
    def ok(self) -> bool:
        """True when no failures were recorded; warnings alone keep the boot green."""
        return not self.failures
|
|
|
|
|
|
def run_preflight(config: WorkspaceConfig, config_path: str) -> PreflightReport:
    """Check the workspace config for obvious startup blockers.

    Failure policy: structural problems (no installable adapter, invalid
    A2A port) append to report.failures and make report.ok False;
    configuration problems (missing auth env vars / token file, absent
    prompt or skill files) only warn so the boot can continue into the
    not-configured state (see _check_required_env for the rationale).

    Args:
        config: Parsed workspace config.yaml.
        config_path: Directory the config was loaded from; prompt files,
            skills and the legacy auth token file resolve relative to it.

    Returns:
        A PreflightReport; callers gate hard failure on report.ok.
    """
    report = PreflightReport()
    config_dir = Path(config_path)

    _check_runtime(config.runtime, report)
    _check_a2a_port(config.a2a.port, report)

    required_env = _resolve_required_env(config.runtime_config)
    _check_required_env(required_env, report)
    _check_legacy_auth_token(config.runtime_config, config_dir, required_env, report)
    _check_prompt_files(config, config_dir, report)
    _check_skills(config, config_dir, report)
    return report


def _check_runtime(runtime: str, report: PreflightReport) -> None:
    """Fail when no usable adapter is installed; warn on runtime-name drift."""
    runtime_ok, runtime_detail = _validate_runtime_via_adapter(runtime)
    if not runtime_ok:
        report.failures.append(
            PreflightIssue(
                severity="fail",
                title="Runtime",
                detail=runtime_detail,
                fix=(
                    "Install the matching adapter (template repo's Dockerfile "
                    "should set ADAPTER_MODULE) or correct the runtime field in "
                    "config.yaml."
                ),
            )
        )
    elif runtime_detail:
        # ok=True with a detail = drift warning, not a failure.
        report.warnings.append(
            PreflightIssue(
                severity="warn",
                title="Runtime",
                detail=runtime_detail,
                fix="Update config.yaml runtime to match the installed Adapter.name().",
            )
        )


def _check_a2a_port(port, report: PreflightReport) -> None:
    """Fail on an out-of-range — or non-numeric — a2a.port.

    Previously a non-numeric port value made int() raise and crashed
    preflight itself. A malformed port is the same structural problem as
    an out-of-range one, so report it as a failure instead of raising.
    """
    try:
        numeric_port = int(port)
    except (TypeError, ValueError):
        numeric_port = 0  # guaranteed to fail the range check below
    if not 1 <= numeric_port <= 65535:
        report.failures.append(
            PreflightIssue(
                severity="fail",
                title="A2A port",
                detail=f"Invalid A2A port: {port}",
                fix="Set a2a.port to a value between 1 and 65535.",
            )
        )


def _resolve_required_env(runtime_config) -> list:
    """Resolve which env vars preflight should check.

    Base case: the runtime's top-level `required_env` (e.g.
    CLAUDE_CODE_OAUTH_TOKEN, OPENAI_API_KEY), declared per-runtime in
    config.yaml and injected via the secrets API.

    Per-model override: when the template's runtime_config declares
    `models[]` (canvas Model dropdown), the picked model's own
    `required_env` wins outright over the top-level fallback — do NOT
    union. Templates use per-model entries precisely to express that
    different models have *different* auth paths (OAuth token vs API key
    vs third-party provider key); unioning re-introduces the crash-loop
    surfaced 2026-05-02 (claude-code-default's top-level list demanded
    CLAUDE_CODE_OAUTH_TOKEN even though the user picked MiniMax and set
    only MINIMAX_API_KEY). An explicit empty list means "no auth needed"
    (e.g. local Ollama or self-hosted endpoints) and MUST short-circuit
    the fallback — hence the `"required_env" in entry` membership test
    rather than truthiness.

    Ids match case-insensitively: canvas-side ids ("MiniMax-M2.7") and
    adapter-side normalization ("minimax-m2.7") drift by case across
    registries. The picked model is `runtime_config.model`, which already
    honors the MODEL_PROVIDER env override at parse time (see
    config.py:RuntimeConfig.model resolution).
    """
    required_env = list(getattr(runtime_config, "required_env", []) or [])
    models = getattr(runtime_config, "models", None) or []
    picked_model = (getattr(runtime_config, "model", "") or "").strip()
    if not (models and picked_model):
        return required_env

    picked_lower = picked_model.lower()
    for entry in models:
        if not isinstance(entry, dict):
            continue
        entry_id = str(entry.get("id", "")).strip()
        if not entry_id or entry_id.lower() != picked_lower:
            continue
        if "required_env" in entry:
            required_env = list(entry.get("required_env") or [])
        break
    return required_env


def _check_required_env(required_env: list, report: PreflightReport) -> None:
    """Warn — never fail — on unset required environment variables.

    Missing required env is a CONFIGURATION issue, not a STRUCTURAL one.
    The workspace can still bind /.well-known/agent-card.json —
    adapter.setup() raises on the missing key, main.py's PR #2756
    try/except mounts the not-configured JSON-RPC handler, and canvas
    surfaces a clear "agent not configured: <reason>" error to the user.
    Hard-failing preflight here would crash before the not-configured
    path even loads, leaving the workspace invisible (the failure mode
    that bit codex/openclaw bench 25335853189 on 2026-05-04 even after
    PR #2756). Warn loudly so logs remain actionable, but let the boot
    continue.

    Smoke mode softens the message further: the boot smoke (CI
    publish-image, issue #2275) exercises executor.execute() against stub
    deps, never hits the real provider, and CI cannot enumerate every
    adapter's auth env without forming a maintenance treadmill (Hermes
    2026-05-03 outage: molecule-ci injected CLAUDE_CODE_OAUTH_TOKEN/
    ANTHROPIC_API_KEY/etc. but not HERMES_API_KEY). The bypass lets new
    templates ship without the workflow learning their env names.
    """
    smoke_mode = os.environ.get("MOLECULE_SMOKE_MODE", "").strip().lower() in (
        "1", "true", "yes", "on",
    )
    for env_var in required_env:
        if os.environ.get(env_var):
            continue
        if smoke_mode:
            report.warnings.append(
                PreflightIssue(
                    severity="warn",
                    title="Required env",
                    detail=f"Missing {env_var} (skipped — MOLECULE_SMOKE_MODE)",
                    fix="",
                )
            )
            continue
        report.warnings.append(
            PreflightIssue(
                severity="warn",
                title="Required env",
                detail=f"Missing required environment variable: {env_var}",
                fix=(
                    f"Set {env_var} via the secrets API (global or workspace-level). "
                    "Workspace will boot in not-configured state until this is set; "
                    "JSON-RPC will return -32603 'agent not configured' on every request."
                ),
            )
        )


def _check_legacy_auth_token(runtime_config, config_dir: Path, required_env: list, report: PreflightReport) -> None:
    """Warn on a missing legacy auth_token_file (backward compat).

    Same warn-don't-fail reasoning as _check_required_env: the downstream
    auth check fires inside adapter.setup(), which is wrapped by main.py's
    try/except. The warning is suppressed when the token is available via
    auth_token_env, or when every required_env var is set (the new path).
    """
    token_file = getattr(runtime_config, "auth_token_file", "")
    if not token_file:
        return
    token_path = config_dir / token_file
    if token_path.exists():
        return
    token_env = getattr(runtime_config, "auth_token_env", "")
    env_has_token = bool(token_env and os.environ.get(token_env))
    # Also check if any required_env is set (covers the new path)
    if not env_has_token and required_env:
        env_has_token = all(os.environ.get(e) for e in required_env)
    if env_has_token:
        return
    report.warnings.append(
        PreflightIssue(
            severity="warn",
            title="Auth token",
            detail=f"Missing auth token file: {token_file}",
            fix=(
                "Remove auth_token_file and use required_env + secrets API "
                "instead. Workspace will boot in not-configured state until "
                "the token is provided."
            ),
        )
    )


def _check_prompt_files(config: WorkspaceConfig, config_dir: Path, report: PreflightReport) -> None:
    """Warn on prompt files listed in config but absent on disk."""
    prompt_files = config.prompt_files or ["system-prompt.md"]
    for prompt_file in prompt_files:
        if not (config_dir / prompt_file).exists():
            report.warnings.append(
                PreflightIssue(
                    severity="warn",
                    title="Prompt file",
                    detail=f"Missing prompt file: {prompt_file}",
                    fix="Add the file or remove it from prompt_files.",
                )
            )


def _check_skills(config: WorkspaceConfig, config_dir: Path, report: PreflightReport) -> None:
    """Warn on skills declared in config whose SKILL.md package is missing."""
    skills_dir = config_dir / "skills"
    for skill_name in config.skills:
        if not (skills_dir / skill_name / "SKILL.md").exists():
            report.warnings.append(
                PreflightIssue(
                    severity="warn",
                    title="Skill",
                    detail=f"Missing skill package: {skill_name}",
                    fix="Restore the skill folder or remove it from config.yaml.",
                )
            )
|
|
|
|
|
|
def render_preflight_report(report: PreflightReport) -> None:
    """Print a concise startup report (failures first, then warnings).

    Emits nothing at all when the report is clean.
    """
    if not report.failures and not report.warnings:
        return

    print("Preflight checks:")
    # Failures before warnings, each tagged with its severity label.
    for label, issues in (("FAIL", report.failures), ("WARN", report.warnings)):
        for issue in issues:
            print(f"[{label}] {issue.title}: {issue.detail}")
            if issue.fix:
                print(f" Fix: {issue.fix}")