molecule-ai-workspace-templ…/Dockerfile

FROM python:3.11-slim

# System deps:
#   curl, ca-certificates — TLS + Node tarball download
#   git           — codex's agent tools use git
#   gosu          — drop privileges in start.sh
#   xz-utils      — Node tarball is .tar.xz
#
# T4 escalation leg (RFC internal#456 §9 / PR#474 — mirrors the
# already-live-verified claude-code template image, commit 12dd604,
# and the in-flight hermes PR#26 / openclaw PR#19):
#   sudo + util-linux(nsenter) + docker.io(CLI) are baked here so the
#   uid-1000 `agent` (see useradd below — UNCHANGED, agent stays
#   uid-1000; start.sh still `exec gosu agent`) has a wired, audited
#   path to host root inside the provisioner's `--privileged
#   --pid=host -v /:/host -v /var/run/docker.sock:/var/run/docker.sock`
#   container. Without sudo, a uid-1000 process in --privileged CANNOT
#   nsenter/chroot /host (--privileged grants caps to root, not
#   uid-1000) and cannot use the root:docker 0660 docker.sock — T4
#   would be provisioner-shape-only (the documented ABSENT-escalation
#   -leg gap; the codex prod pin sha256:877e0687 / git 99e7f13 is the
#   2026-05-06 ECR-mirror rollback that PREDATES all T4 work and ships
#   NO leg). The sudoers drop-in + docker-group add are below, after
#   useradd, so `agent` exists. This is ADDITIVE: it does NOT change
#   the agent uid and does NOT change token ownership. The codex MCP
#   list_peers-401 token-resolution class (RFC internal#456 §10) is
#   fixed atomically in the SAME image revision via codex_mcp_config.sh
#   + start.sh's `chown -R agent:agent /configs`.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl ca-certificates git gosu xz-utils \
    sudo util-linux docker.io \
    && rm -rf /var/lib/apt/lists/*

# Node.js 20 LTS via NodeSource (codex CLI requires Node ≥20).
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/*

# Non-root agent user — UNCHANGED. codex stores sessions under
# ~/.codex/sessions/ so /home/agent should be a persistent volume in
# production deployments to keep thread state across workspace
# restarts. The agent runs as uid-1000; the T4 escalation leg below is
# additive and does NOT promote the agent to root.
RUN useradd -u 1000 -m -s /bin/bash agent

# --- T4 escalation leg (RFC internal#456 §9.3 / PR#474) ---
# Wired path: uid-1000 agent -> host root inside the provisioner's
# --privileged --pid=host -v /:/host -v docker.sock container.
#   1. NOPASSWD sudoers drop-in (mode 0440, visudo-validated at build
#      so a malformed sudoers can never ship a broken-sudo image).
#   2. agent in the `docker` group so the bind-mounted root:docker
#      0660 /var/run/docker.sock is usable without sudo.
# Atomic co-sequencing (RFC §10): this ships in the SAME image
# revision as the uid-1000 + agent-owned-token start.sh contract and
# the codex_mcp_config.sh CONFIGS_DIR resolution fix; the Layer-3
# conformance gate asserts BOTH host-root reach AND agent-owned token
# on the running container. Mirrors claude-code template image
# (12dd604, already live-verified) verbatim.
RUN set -eux; \
    printf 'agent ALL=(ALL) NOPASSWD:ALL\n' > /etc/sudoers.d/agent-t4; \
    chmod 0440 /etc/sudoers.d/agent-t4; \
    visudo -cf /etc/sudoers.d/agent-t4; \
    groupadd -f docker; \
    usermod -aG docker agent; \
    id agent

WORKDIR /app

# RUNTIME_VERSION arg matches hermes/openclaw conventions — when set
# (cascade-triggered builds), it pins the exact runtime version PyPI
# just published. Including it as ARG changes the cache key for the
# pip install layer below — without this, identical Dockerfile +
# requirements.txt would let docker reuse the cached layer with the
# previous version baked in (the cache trap that bit us 5x on
# 2026-04-27 — see runtime publish pipeline gates memory).
ARG RUNTIME_VERSION=

COPY requirements.txt .
# The codex runtime is registered the SAME way hermes/claude-code do
# it: ENV ADAPTER_MODULE=adapter (set below) — the runtime's adapter
# discovery loads adapter.py and `CodexAdapter.name()` ("codex") is
# authoritative. The previous Dockerfile (inherited from the stale
# single-commit Gitea mirror) ALSO monkeypatched
# `molecule_runtime.preflight.SUPPORTED_RUNTIMES` via an unguarded
# `python3 -c ...add('codex')` + a brittle in-file `sed`. That worked
# against the 2026-05-04 runtime baked into the deployed image
# (sha256:877e0687) but the CURRENT published runtime no longer
# exposes that exact mutable-set literal, so the unguarded RUN exited
# 1 and FAILED THE BUILD (validate-runtime + t4-conformance, CI run
# 1). Root-cause fix: drop the brittle file-rewrite entirely (neither
# hermes nor claude-code patch preflight — adapter discovery is the
# real registration path) and keep only a defensive, idempotent,
# never-fail compatibility shim for any older runtime that still gates
# on a mutable SUPPORTED_RUNTIMES set. `|| true` so a runtime that has
# no such attribute (the modern shape) builds clean.
RUN pip install --no-cache-dir -r requirements.txt && \
    if [ -n "${RUNTIME_VERSION}" ]; then \
      pip install --no-cache-dir --upgrade "molecule-ai-workspace-runtime==${RUNTIME_VERSION}"; \
    fi && \
    python3 -c "import molecule_runtime.preflight as pf; s=getattr(pf,'SUPPORTED_RUNTIMES',None); s.add('codex') if isinstance(s,set) else None; print('preflight SUPPORTED_RUNTIMES shim:', 'patched' if isinstance(s,set) else 'n/a (adapter-module discovery is authoritative)')" || true

COPY adapter.py executor.py app_server.py provider_config.py __init__.py ./
COPY config.yaml ./
COPY start.sh /usr/local/bin/start.sh

# Generic GIT_ASKPASS helper. Reads HTTPS Basic-Auth credentials from
# env vars (GIT_HTTP_USERNAME / GIT_HTTP_PASSWORD, with GITEA_USER /
# GITEA_TOKEN as fallback) and emits them on the git credential-prompt
# protocol, so container-side `git` can authenticate to any private
# HTTPS remote without on-disk .gitconfig / .git-credentials mutation.
# Installed as /usr/local/bin/molecule-askpass — the platform-side
# provisioner sets GIT_ASKPASS to that path. Script body contains no
# hostnames or vendor literals; the deployer decides which remote the
# credentials apply to by virtue of populating those env vars.
COPY scripts/git-askpass.sh /usr/local/bin/molecule-askpass
RUN chmod +x /usr/local/bin/molecule-askpass
# Provider/MCP config helpers — invoked by start.sh on every boot.
#
# render_provider_toml.py is the new YAML-driven entry point: reads
# `providers:` from config.yaml, resolves to the right provider for
# the env, and writes ~/.codex/config.toml accordingly. Replaces the
# legacy hardcoded codex_minimax_config.sh path.
#
# codex_minimax_config.sh is kept as a compat fallback (one release)
# for downstream ops scripts and existing tests that exec it directly;
# start.sh prefers the python helper when available.
#
# codex_mcp_config.sh appends the molecule A2A MCP server block
# (list_peers / delegate_task / commit_memory) and resolves the
# correct CONFIGS_DIR so the MCP child reads the same .auth_token the
# runtime writes (the list_peers-401 fix). start.sh probes both
# /usr/local/bin and /app — install to /usr/local/bin (the primary).
COPY render_provider_toml.py /usr/local/bin/render_provider_toml.py
# provider_config.py is imported by render_provider_toml.py at runtime;
# co-install into /usr/local/bin so the script can find it from there
# (the `_HERE` sys.path insert in render_provider_toml.py picks it up).
# It also lives in /app via the COPY above for adapter.py import.
COPY provider_config.py /usr/local/bin/provider_config.py
COPY codex_minimax_config.sh codex_mcp_config.sh /usr/local/bin/
# codex_auth_refresh.sh — OAuth refresh watchdog (RFC internal#569).
# start.sh launches it as `gosu agent` after auth.json is materialized;
# it polls every 6h and rewrites auth.json atomically when the access
# token is within 4h of expiry OR last_refresh is older than 7d. Inert
# when no auth.json is present (the API-key / MiniMax paths skip it).
COPY codex_auth_refresh.sh /usr/local/bin/codex_auth_refresh.sh
RUN chmod +x /usr/local/bin/start.sh \
             /usr/local/bin/codex_minimax_config.sh \
             /usr/local/bin/codex_mcp_config.sh \
             /usr/local/bin/render_provider_toml.py \
             /usr/local/bin/codex_auth_refresh.sh

# Build-time smoke check for the OAuth refresh watchdog (PR#24
# regression-pin). Pre-PR#24 the script hardcoded
# /opt/molecule-venv/bin/python3, a path that does NOT exist in this
# image (we build FROM python:3.11-slim → python3 at
# /usr/local/bin/python3). Every helper invocation exited 127, OAuth
# refresh never fired, id_token expired silently, Researcher wedged
# upstream of stdout (ae2c3012 diagnosis). This RUN executes the
# watchdog's `--once` path against an absent CODEX_HOME — which exercises
# the python3 resolver AND the absent-auth.json skip branch. Expected
# rc=1 (skip:no_auth_json); rc=127 means the python3 path regressed and
# the image must fail to build, NEVER ship.
RUN set -eux; \
    bash -n /usr/local/bin/codex_auth_refresh.sh; \
    rc=0; \
    CODEX_HOME=/tmp/.codex-smoke-no-auth /usr/local/bin/codex_auth_refresh.sh --once || rc=$?; \
    rm -rf /tmp/.codex-smoke-no-auth; \
    if [ "$rc" -eq 127 ]; then \
      echo "FATAL: codex_auth_refresh.sh exited 127 at image-build smoke — python3 helper not located. PR#19 OAuth auto-refresh would ship broken (PR#24 regression-pin)." >&2; \
      exit 1; \
    fi; \
    if [ "$rc" -ne 1 ]; then \
      echo "FATAL: codex_auth_refresh.sh smoke produced rc=$rc (expected rc=1 skip:no_auth_json). Image-build watchdog smoke failed." >&2; \
      exit 1; \
    fi; \
    echo "[image-build smoke] codex_auth_refresh.sh OAuth watchdog OK (rc=1 skip:no_auth_json — python3 helper resolves)."

# --- Install the OpenAI Codex CLI globally as root (binary lives in
# /usr/lib/node_modules and symlinks into /usr/bin/codex; available to
# both root and the agent user).
#
# Pinned EXACTLY to 0.130.0 (not a `~`/`^` range). Rationale:
#   * 0.130.0 is the npm `latest` dist-tag — the current stable line
#     (0.131.x is alpha-only at the time of this change; we do not
#     ship a pre-release CLI in a prod runtime image).
#   * The previous `~0.57` pin PREDATES `codex login --device-auth` /
#     ChatGPT-subscription OAuth: it cannot consume the modern
#     `auth.json` shape ({auth_mode:"chatgpt", tokens:{id_token,
#     access_token,refresh_token,account_id}, last_refresh}) and
#     ignores `forced_login_method = "chatgpt"`. The subscription
#     OAuth credential we now materialize (see start.sh Mode C) is
#     only usable on a CLI that supports this format — 0.130.0 does.
#   * config.yaml's default model (`gpt-5.5`) and the May-2026 roster
#     were already live-verified against codex-cli 0.130.0
#     linux/amd64 (thread/start returned "model":"gpt-5.5").
#   * codex's app-server protocol is `experimental` and breaks across
#     minor versions, so we pin the EXACT patch release rather than a
#     range — a bump is a deliberate, reviewed, re-verified change.
RUN npm install -g @openai/codex@0.130.0

USER agent
WORKDIR /home/agent
USER root
WORKDIR /app

ENV ADAPTER_MODULE=adapter \
    PYTHONPATH=/app

# start.sh is intentionally minimal — codex doesn't need a separate
# daemon to boot; the app-server is a stdio child spawned by
# executor.py on the first A2A turn. start.sh also generates the
# MiniMax provider config + molecule MCP block and (as root, before
# the gosu drop) makes /configs agent-owned so the runtime AND the MCP
# child resolve the same agent-owned .auth_token.
ENTRYPOINT ["/usr/local/bin/start.sh"]