fix(concierge): stop core building the wrong molecule-platform-agent image (#2970/#30) #3027

Merged
core-devops merged 1 commits from fix/concierge-remove-core-wrong-build into main 2026-06-18 03:46:38 +00:00
2 changed files with 7 additions and 212 deletions
@@ -78,13 +78,13 @@ env:
# touching every workflow). Pattern mirrors `vars.CP_URL || 'literal'` already in
# use below in this repo's staging-verify.yml.
IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform
# Concierge (kind=platform) image: the base platform image + the platform-agent
# template's identity (config.yaml/prompts/mcp_servers/identity-fallback.sh)
# baked in via Dockerfile.platform-agent. Built from .tenant-bundle-deps/
# workspace-configs-templates/platform-agent (staged by clone-manifest.sh from
# the manifest platform-agent entry). The CP selects this image for kind=platform
# (core#2495). Without this build the concierge boots with no identity (#2919/#2955).
PLATFORM_AGENT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/molecule-platform-agent
# Concierge (kind=platform) image (molecule-ai/molecule-platform-agent) is
# built by the TEMPLATE repo (workspace-template-claude-code, publish-image.yml
# → publish-platform-agent) FROM the claude-code runtime image with the
# org-management MCP baked at /opt/molecule-mcp-server (RFC platform-agent §5.7).
# Core does NOT build it: the prior FROM-platform-tenant build here shipped an
# image with no runtime + no MCP server, so the concierge never registered
# (#2970/#30). Pin promotion is operator-gated (POST /cp/admin/runtime-image/promote).
# Tenant image repositories (molecule-ai/platform-tenant). Each registry host
# comes from a repo variable and falls back to the literal host when the
# variable is unset: ECR_REGISTRY for the default registry,
# STAGING_ECR_REGISTRY for the staging one.
TENANT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant
STAGING_TENANT_IMAGE_NAME: ${{ vars.STAGING_ECR_REGISTRY || '004947743811.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant
@@ -327,55 +327,6 @@ jobs:
fi
done
# Build + push the CONCIERGE platform-agent image. Extends the LIVE
# platform-tenant image (built just above) with the concierge identity
# baked from the platform-agent template (staged at .tenant-bundle-deps/
# workspace-configs-templates/platform-agent by the Pre-clone step from
# the manifest platform-agent entry). MUST run AFTER the tenant build —
# FROM ${TENANT_IMAGE_NAME}:${TAG_SHA}, the live workspace-server image
# concierges already run (it has /entrypoint.sh = entrypoint-tenant.sh,
# which the platform-agent wrapper chains to). The dead molecule-ai/platform
# base (unbuilt since 2026-05-15) is deliberately NOT used. CP selects this
# image for kind=platform (core#2495); without it the concierge boots with
# no identity (#2919 image-bake / #2955 identity-fallback.sh).
- name: Build & push platform-agent image to ECR (staging-<sha> + staging-latest)
env:
# Image names are read from the `env` context and re-exported to this step.
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
PLATFORM_AGENT_IMAGE_NAME: ${{ env.PLATFORM_AGENT_IMAGE_NAME }}
# TAG_SHA is used twice below: as the tag of the base (tenant) image and as
# one of the two tags pushed for the platform-agent image.
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
TAG_LATEST: staging-latest
GIT_SHA: ${{ steps.tags.outputs.sha }}
REPO: ${{ github.event.repository.name }}
GITHUB_RUN_ID: ${{ github.run_id }}
# AWS credentials and region consumed by the `aws` CLI call in the script.
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
run: |
set -euo pipefail
# Registry host = the image name with everything from the first "/" removed.
ECR_REGISTRY="${PLATFORM_AGENT_IMAGE_NAME%%/*}"
# Log docker in to that registry using the password printed by the AWS CLI.
aws ecr get-login-password --region us-east-2 | \
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
# Builder named after the run id. Creation is best-effort: its output is
# discarded and a failure is ignored (`|| true`).
builder="pa-builder-${GITHUB_RUN_ID}"
docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true
# Build Dockerfile.platform-agent with the current directory as context and
# push both tags. BASE_IMAGE points the Dockerfile's FROM at the tenant image
# tagged ${TAG_SHA}. The registry cache export sets ignore-error=true, so a
# failed cache push should not fail the build. If the build itself fails, the
# handler after `||` removes the builder, emits an ::error:: annotation and
# exits 1.
docker buildx build \
--builder "${builder}" \
--file ./workspace-server/Dockerfile.platform-agent \
--build-arg BASE_IMAGE="${TENANT_IMAGE_NAME}:${TAG_SHA}" \
--provenance=false \
--sbom=false \
--build-arg GIT_SHA="${GIT_SHA}" \
--label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
--label "org.opencontainers.image.revision=${GIT_SHA}" \
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
--cache-from "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache" \
--cache-to "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
--tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}" \
--tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_LATEST}" \
--push . || { docker buildx rm "${builder}" >/dev/null 2>&1 || true; echo "::error::platform-agent image build failed"; exit 1; }
# Success path: remove the per-run builder (best-effort) and report the tag.
docker buildx rm "${builder}" >/dev/null 2>&1 || true
echo "::notice::platform-agent image pushed: ${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}"
# Staging auto-deploy: every workspace-server image publish on main should
# roll out to the staging fleet so code fixes reach staging without a
# manual workflow_dispatch. Gitea 1.22.6 does not support workflow_run, so
-156
View File
@@ -1,156 +0,0 @@
# Platform-agent image variant (RFC #2843 §10a IMAGE-BAKED).
#
# The platform-agent image is the concierge's dedicated image. The base
# platform image (Dockerfile) is the ordinary /platform image; the
# platform-agent variant EXTENDS the base with the concierge's IDENTITY
# baked in, sourced FROM the platform-agent TEMPLATE REPO
# (molecule-ai/molecule-ai-workspace-template-platform-agent) — the
# SAME SSOT that the asset-channel delivers post-#29-activation.
#
# Why a dedicated image: the concierge is a platform-managed agent
# (NOT a user template) with a different threat model and a different
# identity-delivery requirement. The asset channel (PR-B, #2900+#2903)
# works in SaaS+token, in SaaS+public-fetch, and (post-#29-activation)
# in any tenant. The IMAGE-BAKED variant covers the remaining gap:
# self-hosted deployments (no MOLECULE_TEMPLATE_REPO_TOKEN) and the
# pre-#29-activation bootstrap window, where neither the asset channel
# nor the local template path is guaranteed to be available.
#
# SSOT contract (driver hard-requirement on the IMAGE-BAKED impl):
# The image-baked content (config.yaml + prompts/concierge.md +
# mcp_servers.yaml + identity-fallback.sh) is SOURCED FROM the
# platform-agent TEMPLATE REPO, NOT vendored/duplicated in core. A CI
# DRIFT-GATE
# (workspace-server/internal/provisioner/platform_agent_image_drift_test.go,
# pinned against /opt/molecule-platform-agent-template/{config.yaml,
# mcp_servers.yaml,prompts/concierge.md,identity-fallback.sh} vs the
# pre-cloned .tenant-bundle-deps/workspace-configs-templates/platform-
# agent/ source) asserts byte-equal at build time. A future drift
# would fail the drift-gate test (go test -run
# TestPlatformAgentImageDriftGate), catching it BEFORE the image is
# published — so image snapshot + template can NEVER diverge in
# production without a CI-red signal.
#
# Build context: same as Dockerfile. The platform-agent template
# content is pre-cloned by scripts/clone-manifest.sh into
# .tenant-bundle-deps/workspace-configs-templates/platform-agent/
# (the platform-agent template is a manifest.json workspace_templates
# entry per RFC #2843 §10a), and this Dockerfile reads it via the
# PLATFORM_AGENT_TEMPLATE_DIR build-arg (default = canonical CI path).
#
# Usage (operator / CI):
# docker buildx build \
# --build-arg PLATFORM_AGENT_TEMPLATE_DIR=.tenant-bundle-deps/workspace-configs-templates/platform-agent \
# --tag ${REGISTRY}/molecule-ai/molecule-platform-agent:staging-${GIT_SHA} \
# -f workspace-server/Dockerfile.platform-agent \
# .
#
# Runtime contract: a container started from this image has the
# concierge identity at /opt/molecule-platform-agent-template/. The
# pre-#29/self-host fallback path in the workspace-server's
# applyConciergeProvisionConfig hook (workspace-server/internal/
# handlers/platform_agent.go) reads from this path when the asset-
# channel deliver is unavailable. Post-#29 activation, the asset
# channel remains the SSOT-delivery path; the image-baked copy is
# a last-resort fallback (intentionally NOT a parallel SSOT — the
# drift-gate enforces single-SSOT).
#
# The identity-fallback.sh script is the WORKING fallback (the
# IMAGE_BAKED_IDENTITY_PRESENT echo-only marker that the #2919 PR
# shipped was a log line that did nothing — this PR's companion
# template-platform-agent PR adds the real script). The platform-
# agent entrypoint executes the script at boot, BEFORE handing off
# to the base image's /entrypoint.sh. Fill-absent-only: a delivered
# /configs/* (asset-channel SSOT) is NEVER overwritten; the image-
# baked copy is the safety net for self-host + pre-#29-bootstrap
# windows where neither the asset channel nor the local template
# path is guaranteed to be available. See
# template-platform-agent #2 (identity-fallback.sh) for the script
# semantics — SRC=/opt/molecule-platform-agent-template, DST=/configs,
# fail-soft on missing SRC.
# BASE_IMAGE is declared before FROM so the base image can be chosen at build
# time with --build-arg BASE_IMAGE=<image:tag>.
# NOTE(review): the default is a local-looking tag (molecule-local/...) —
# confirm it is intended for local builds only.
ARG BASE_IMAGE=molecule-local/platform:latest
FROM ${BASE_IMAGE}
# PLATFORM-AGENT TEMPLATE CONTENT — SSOT-sourced from the pre-cloned
# template repo. The default path mirrors where scripts/clone-manifest.sh
# places workspace_templates entries
# (.tenant-bundle-deps/workspace-configs-templates/<name>/). The
# platform-agent template is in manifest.json's workspace_templates
# (per RFC #2843 §10a), so the existing pre-clone step in
# publish-workspace-server-image.yml populates this path with no
# additional CI work.
#
# The build-arg exists for operators / staging mirrors that pre-clone
# to a different dir (e.g. a shallow --depth=1 mirror for fast CI).
# Default value is the canonical CI path; override only when the
# pre-clone layout differs.
#
# Why build-arg, not ENV: the path is a BUILD-TIME input, not a
# runtime config; build-args are the right tool and the value never
# enters the running container.
# Build-context-relative directory holding the pre-cloned template; every COPY
# below reads from it and writes under /opt/molecule-platform-agent-template/.
ARG PLATFORM_AGENT_TEMPLATE_DIR=.tenant-bundle-deps/workspace-configs-templates/platform-agent
# Identity files: agent config, MCP server list, and the prompts directory.
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/config.yaml /opt/molecule-platform-agent-template/config.yaml
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/mcp_servers.yaml /opt/molecule-platform-agent-template/mcp_servers.yaml
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/prompts/ /opt/molecule-platform-agent-template/prompts/
# The boot-time identity-fallback script. Executed at container start
# (see /entrypoint-platform-agent.sh below) to fill ABSENT files at
# /configs/ from the image-baked /opt path. The script is the SSOT in
# the platform-agent TEMPLATE REPO — drift-gate
# (platform_agent_image_drift_test.go) catches content drift between
# this COPY source and the image-baked destination.
#
# RCA 12124 (DRIVER-ESCALATED live prod identity): the script MUST
# write /configs/system-prompt.md (the file the
# conciergeIdentityPresent probe at platform_agent.go:399 reads) —
# NOT just /configs/prompts/concierge.md. Prior shape had a
# conditional write (`if [ ! -s "$DST/system-prompt.md" ]`) which
# could fail to fire after a partial-template run; the fixed script
# in the template-platform-agent repo (PR-side, merged to template
# main) is unconditional: always writes /configs/system-prompt.md
# from prompts/concierge.md + {{CONCIERGE_NAME}} substitution.
# COPY --chmod sets +x at copy time (buildx-native). A `RUN chmod` fails with
# "Operation not permitted" when the base image runs as a non-root user — the
# live platform-tenant base does, whereas the dead molecule-ai/platform base was
# root, which masked this. --chmod works regardless of base USER.
COPY --chmod=0755 ${PLATFORM_AGENT_TEMPLATE_DIR}/identity-fallback.sh /opt/molecule-platform-agent-template/identity-fallback.sh
# PLATFORM-AGENT ENTRYPOINT — runs identity-fallback.sh FIRST (fills
# absent /configs/ files from the image-baked /opt path; the
# asset-channel deliver is the SSOT post-#29-activation, this is the
# self-host + pre-#29-bootstrap safety net), then hands off to the
# base image's /entrypoint.sh (which does docker-socket group setup,
# memory-plugin sidecar spawn-gate, then exec su-exec platform
# /platform).
#
# Why a separate entrypoint (not extending /entrypoint.sh in core):
# the IMAGE-BAKED identity-fallback is a platform-agent-specific
# concern — the base /entrypoint.sh stays the single runtime entry
# for the ordinary /platform image, and the platform-agent variant
# overrides only the boot hook. The script is run as a child process
# behind an `-x` guard (neither sourced nor exec'd), so a missing or
# failing script only logs a warning and boot continues (su-exec will
# still run /platform; the runtime's MISSING_MODEL fail-closed surfaces
# the operator-visible error in that case).
COPY --chmod=0755 <<'ENTRY' /entrypoint-platform-agent.sh
#!/bin/sh
# Boot hook for the platform-agent image. It does two things, in order:
#   1. Runs the image-baked identity-fallback script, which is documented to
#      copy files that are ABSENT under /configs/ from the image-baked SSOT
#      path. This is the safety net for self-host and pre-#29-bootstrap
#      windows; the asset-channel deliver stays authoritative when it lands.
#   2. Hands the process over to the base image's /entrypoint.sh.
# Neither a missing nor a failing fallback script stops the boot; each case
# only writes a warning to stderr.
fallback=/opt/molecule-platform-agent-template/identity-fallback.sh
if [ ! -x "$fallback" ]; then
  echo "platform-agent: identity-fallback.sh not present (image built without it); skipping fallback (runtime will MISSING_MODEL fail-closed)" >&2
elif ! "$fallback"; then
  echo "platform-agent: ⚠️ identity-fallback.sh failed (see prior log lines); continuing boot — runtime will MISSING_MODEL fail-closed if /configs is empty" >&2
fi
# Replace this shell with the base entrypoint (docker-socket group setup,
# memory-plugin sidecar spawn-gate, then su-exec platform /platform).
# CMD arguments are forwarded untouched, so this wrapper is transparent
# to however the operator or CI invokes the image.
exec /entrypoint.sh "$@"
ENTRY
ENTRYPOINT ["/entrypoint-platform-agent.sh"]