diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 8e493dde..7aca8132 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -78,13 +78,13 @@ env: # touching every workflow). Pattern mirrors `vars.CP_URL || 'literal'` already in # use below in this repo's staging-verify.yml. IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform - # Concierge (kind=platform) image: the base platform image + the platform-agent - # template's identity (config.yaml/prompts/mcp_servers/identity-fallback.sh) - # baked in via Dockerfile.platform-agent. Built from .tenant-bundle-deps/ - # workspace-configs-templates/platform-agent (staged by clone-manifest.sh from - # the manifest platform-agent entry). The CP selects this image for kind=platform - # (core#2495). Without this build the concierge boots with no identity (#2919/#2955). - PLATFORM_AGENT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/molecule-platform-agent + # Concierge (kind=platform) image (molecule-ai/molecule-platform-agent) is + # built by the TEMPLATE repo (workspace-template-claude-code, publish-image.yml + # → publish-platform-agent) FROM the claude-code runtime image with the + # org-management MCP baked at /opt/molecule-mcp-server (RFC platform-agent §5.7). + # Core does NOT build it: the prior FROM-platform-tenant build here shipped an + # image with no runtime + no MCP server, so the concierge never registered + # (#2970/#30). Pin promotion is operator-gated (POST /cp/admin/runtime-image/promote). 
TENANT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant STAGING_TENANT_IMAGE_NAME: ${{ vars.STAGING_ECR_REGISTRY || '004947743811.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant @@ -327,55 +327,6 @@ jobs: fi done - # Build + push the CONCIERGE platform-agent image. Extends the LIVE - # platform-tenant image (built just above) with the concierge identity - # baked from the platform-agent template (staged at .tenant-bundle-deps/ - # workspace-configs-templates/platform-agent by the Pre-clone step from - # the manifest platform-agent entry). MUST run AFTER the tenant build — - # FROM ${TENANT_IMAGE_NAME}:${TAG_SHA}, the live workspace-server image - # concierges already run (it has /entrypoint.sh = entrypoint-tenant.sh, - # which the platform-agent wrapper chains to). The dead molecule-ai/platform - # base (unbuilt since 2026-05-15) is deliberately NOT used. CP selects this - # image for kind=platform (core#2495); without it the concierge boots with - # no identity (#2919 image-bake / #2955 identity-fallback.sh). 
- - name: Build & push platform-agent image to ECR (staging- + staging-latest) - env: - TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }} - PLATFORM_AGENT_IMAGE_NAME: ${{ env.PLATFORM_AGENT_IMAGE_NAME }} - TAG_SHA: staging-${{ steps.tags.outputs.sha }} - TAG_LATEST: staging-latest - GIT_SHA: ${{ steps.tags.outputs.sha }} - REPO: ${{ github.event.repository.name }} - GITHUB_RUN_ID: ${{ github.run_id }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-east-2 - run: | - set -euo pipefail - ECR_REGISTRY="${PLATFORM_AGENT_IMAGE_NAME%%/*}" - aws ecr get-login-password --region us-east-2 | \ - docker login --username AWS --password-stdin "${ECR_REGISTRY}" - builder="pa-builder-${GITHUB_RUN_ID}" - docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true - docker buildx build \ - --builder "${builder}" \ - --file ./workspace-server/Dockerfile.platform-agent \ - --build-arg BASE_IMAGE="${TENANT_IMAGE_NAME}:${TAG_SHA}" \ - --provenance=false \ - --sbom=false \ - --build-arg GIT_SHA="${GIT_SHA}" \ - --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \ - --label "org.opencontainers.image.revision=${GIT_SHA}" \ - --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ - --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \ - --cache-from "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache" \ - --cache-to "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \ - --tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}" \ - --tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_LATEST}" \ - --push . 
|| { docker buildx rm "${builder}" >/dev/null 2>&1 || true; echo "::error::platform-agent image build failed"; exit 1; } - docker buildx rm "${builder}" >/dev/null 2>&1 || true - echo "::notice::platform-agent image pushed: ${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}" - # Staging auto-deploy: every workspace-server image publish on main should # roll out to the staging fleet so code fixes reach staging without a # manual workflow_dispatch. Gitea 1.22.6 does not support workflow_run, so diff --git a/workspace-server/Dockerfile.platform-agent b/workspace-server/Dockerfile.platform-agent deleted file mode 100644 index 368af1ba..00000000 --- a/workspace-server/Dockerfile.platform-agent +++ /dev/null @@ -1,156 +0,0 @@ -# Platform-agent image variant (RFC #2843 §10a IMAGE-BAKED). -# -# The platform-agent image is the concierge's dedicated image. The base -# platform image (Dockerfile) is the ordinary /platform image; the -# platform-agent variant EXTENDS the base with the concierge's IDENTITY -# baked in, sourced FROM the platform-agent TEMPLATE REPO -# (molecule-ai/molecule-ai-workspace-template-platform-agent) — the -# SAME SSOT that the asset-channel delivers post-#29-activation. -# -# Why a dedicated image: the concierge is a platform-managed agent -# (NOT a user template) with a different threat model and a different -# identity-delivery requirement. The asset channel (PR-B, #2900+#2903) -# works in SaaS+token, in SaaS+public-fetch, and (post-#29-activation) -# in any tenant. The IMAGE-BAKED variant covers the remaining gap: -# self-hosted deployments (no MOLECULE_TEMPLATE_REPO_TOKEN) and the -# pre-#29-activation bootstrap window, where neither the asset channel -# nor the local template path is guaranteed to be available. 
-# -# SSOT contract (driver hard-requirement on the IMAGE-BAKED impl): -# The image-baked content (config.yaml + prompts/concierge.md + -# mcp_servers.yaml + identity-fallback.sh) is SOURCED FROM the -# platform-agent TEMPLATE REPO, NOT vendored/duplicated in core. A CI -# DRIFT-GATE -# (workspace-server/internal/provisioner/platform_agent_image_drift_test.go, -# pinned against /opt/molecule-platform-agent-template/{config.yaml, -# mcp_servers.yaml,prompts/concierge.md,identity-fallback.sh} vs the -# pre-cloned .tenant-bundle-deps/workspace-configs-templates/platform- -# agent/ source) asserts byte-equal at build time. A future drift -# would fail the drift-gate test (go test -run -# TestPlatformAgentImageDriftGate), catching it BEFORE the image is -# published — so image snapshot + template can NEVER diverge in -# production without a CI-red signal. -# -# Build context: same as Dockerfile. The platform-agent template -# content is pre-cloned by scripts/clone-manifest.sh into -# .tenant-bundle-deps/workspace-configs-templates/platform-agent/ -# (the platform-agent template is a manifest.json workspace_templates -# entry per RFC #2843 §10a), and this Dockerfile reads it via the -# PLATFORM_AGENT_TEMPLATE_DIR build-arg (default = canonical CI path). -# -# Usage (operator / CI): -# docker buildx build \ -# --build-arg PLATFORM_AGENT_TEMPLATE_DIR=.tenant-bundle-deps/workspace-configs-templates/platform-agent \ -# --tag ${REGISTRY}/molecule-ai/platform-platform-agent:staging-${GIT_SHA} \ -# -f workspace-server/Dockerfile.platform-agent \ -# . -# -# Runtime contract: a container started from this image has the -# concierge identity at /opt/molecule-platform-agent-template/. The -# pre-#29/self-host fallback path in the workspace-server's -# applyConciergeProvisionConfig hook (workspace-server/internal/ -# handlers/platform_agent.go) reads from this path when the asset- -# channel deliver is unavailable. 
Post-#29 activation, the asset -# channel remains the SSOT-delivery path; the image-baked copy is -# a last-resort fallback (intentionally NOT a parallel SSOT — the -# drift-gate enforces single-SSOT). -# -# The identity-fallback.sh script is the WORKING fallback (the -# IMAGE_BAKED_IDENTITY_PRESENT echo-only marker that the #2919 PR -# shipped was a log line that did nothing — this PR's companion -# template-platform-agent PR adds the real script). The platform- -# agent entrypoint sources the script at boot, BEFORE handing off -# to the base image's /entrypoint.sh. Fill-absent-only: a delivered -# /configs/* (asset-channel SSOT) is NEVER overwritten; the image- -# baked copy is the safety net for self-host + pre-#29-bootstrap -# windows where neither the asset channel nor the local template -# path is guaranteed to be available. See -# template-platform-agent #2 (identity-fallback.sh) for the script -# semantics — SRC=/opt/molecule-platform-agent-template, DST=/configs, -# fail-soft on missing SRC. - -ARG BASE_IMAGE=molecule-local/platform:latest -FROM ${BASE_IMAGE} - -# PLATFORM-AGENT TEMPLATE CONTENT — SSOT-sourced from the pre-cloned -# template repo. The default path mirrors where scripts/clone-manifest.sh -# places workspace_templates entries -# (.tenant-bundle-deps/workspace-configs-templates/<name>/). The -# platform-agent template is in manifest.json's workspace_templates -# (per RFC #2843 §10a), so the existing pre-clone step in -# publish-workspace-server-image.yml populates this path with no -# additional CI work. -# -# The build-arg exists for operators / staging mirrors that pre-clone -# to a different dir (e.g. a shallow --depth=1 mirror for fast CI). -# Default value is the canonical CI path; override only when the -# pre-clone layout differs. -# -# Why build-arg, not ENV: the path is a BUILD-TIME input, not a -# runtime config; build-args are the right tool and the value never -# enters the running container.
-ARG PLATFORM_AGENT_TEMPLATE_DIR=.tenant-bundle-deps/workspace-configs-templates/platform-agent -COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/config.yaml /opt/molecule-platform-agent-template/config.yaml -COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/mcp_servers.yaml /opt/molecule-platform-agent-template/mcp_servers.yaml -COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/prompts/ /opt/molecule-platform-agent-template/prompts/ -# The boot-time identity-fallback script. Sourced at container start -# (see /entrypoint-platform-agent.sh below) to fill ABSENT files at -# /configs/ from the image-baked /opt path. The script is the SSOT in -# the platform-agent TEMPLATE REPO — drift-gate -# (platform_agent_image_drift_test.go) catches content drift between -# this COPY source and the image-baked destination. -# -# RCA 12124 (DRIVER-ESCALATED live prod identity): the script MUST -# write /configs/system-prompt.md (the file the -# conciergeIdentityPresent probe at platform_agent.go:399 reads) — -# NOT just /configs/prompts/concierge.md. Prior shape had a -# conditional write (`if [ ! -s "$DST/system-prompt.md" ]`) which -# could fail to fire after a partial-template run; the fixed script -# in the template-platform-agent repo (PR-side, merged to template -# main) is unconditional: always writes /configs/system-prompt.md -# from prompts/concierge.md + {{CONCIERGE_NAME}} substitution. -# COPY --chmod sets +x at copy time (buildx-native). A `RUN chmod` fails with -# "Operation not permitted" when the base image runs as a non-root user — the -# live platform-tenant base does, whereas the dead molecule-ai/platform base was -# root, which masked this. --chmod works regardless of base USER. 
-COPY --chmod=0755 ${PLATFORM_AGENT_TEMPLATE_DIR}/identity-fallback.sh /opt/molecule-platform-agent-template/identity-fallback.sh - -# PLATFORM-AGENT ENTRYPOINT — runs identity-fallback.sh FIRST (fills -# absent /configs/ files from the image-baked /opt path; the -# asset-channel deliver is the SSOT post-#29-activation, this is the -# self-host + pre-#29-bootstrap safety net), then hands off to the -# base image's /entrypoint.sh (which does docker-socket group setup, -# memory-plugin sidecar spawn-gate, then exec su-exec platform -# /platform). -# -# Why a separate entrypoint (not extending /entrypoint.sh in core): -# the IMAGE-BAKED identity-fallback is a platform-agent-specific -# concern — the base /entrypoint.sh stays the single runtime entry -# for the ordinary /platform image, and the platform-agent variant -# overrides only the boot hook. The script is sourced (not exec'd) -# so a missing-script failure bubbles up cleanly (su-exec will still -# run /platform; the runtime's MISSING_MODEL fail-closed surfaces -# the operator-visible error in that case). -COPY --chmod=0755 <<'ENTRY' /entrypoint-platform-agent.sh -#!/bin/sh -# /opt/molecule-platform-agent-template/identity-fallback.sh: per- -# file copy of ABSENT files from the image-baked SSOT path to -# /configs/. The asset-channel deliver (post-#29-activation) is the -# authoritative path when it lands; this is the safety net for self- -# host + pre-#29-bootstrap windows where neither the asset channel -# nor the local template path is guaranteed to be available. 
-if [ -x /opt/molecule-platform-agent-template/identity-fallback.sh ]; then - /opt/molecule-platform-agent-template/identity-fallback.sh || { - echo "platform-agent: ⚠️ identity-fallback.sh failed (see prior log lines); continuing boot — runtime will MISSING_MODEL fail-closed if /configs is empty" >&2 - } -else - echo "platform-agent: identity-fallback.sh not present (image built without it); skipping fallback (runtime will MISSING_MODEL fail-closed)" >&2 -fi -# Hand off to the base image's entrypoint (docker-socket group -# setup, memory-plugin sidecar spawn-gate, then su-exec platform -# /platform). Pass through any CMD args (the platform-agent image -# is invoked the same way as the base — operator/CI sets CMD as -# needed; this entrypoint is transparent to the args). -exec /entrypoint.sh "$@" -ENTRY -ENTRYPOINT ["/entrypoint-platform-agent.sh"]