fix(concierge): stop core building the wrong molecule-platform-agent image (#2970/#30) #3027
@@ -78,13 +78,13 @@ env:
|
||||
# touching every workflow). Pattern mirrors `vars.CP_URL || 'literal'` already in
|
||||
# use below in this repo's staging-verify.yml.
|
||||
IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform
|
||||
# Concierge (kind=platform) image: the base platform image + the platform-agent
|
||||
# template's identity (config.yaml/prompts/mcp_servers/identity-fallback.sh)
|
||||
# baked in via Dockerfile.platform-agent. Built from .tenant-bundle-deps/
|
||||
# workspace-configs-templates/platform-agent (staged by clone-manifest.sh from
|
||||
# the manifest platform-agent entry). The CP selects this image for kind=platform
|
||||
# (core#2495). Without this build the concierge boots with no identity (#2919/#2955).
|
||||
PLATFORM_AGENT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/molecule-platform-agent
|
||||
# Concierge (kind=platform) image (molecule-ai/molecule-platform-agent) is
|
||||
# built by the TEMPLATE repo (workspace-template-claude-code, publish-image.yml
|
||||
# → publish-platform-agent) FROM the claude-code runtime image with the
|
||||
# org-management MCP baked at /opt/molecule-mcp-server (RFC platform-agent §5.7).
|
||||
# Core does NOT build it: the prior FROM-platform-tenant build here shipped an
|
||||
# image with no runtime + no MCP server, so the concierge never registered
|
||||
# (#2970/#30). Pin promotion is operator-gated (POST /cp/admin/runtime-image/promote).
|
||||
TENANT_IMAGE_NAME: ${{ vars.ECR_REGISTRY || '153263036946.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant
|
||||
STAGING_TENANT_IMAGE_NAME: ${{ vars.STAGING_ECR_REGISTRY || '004947743811.dkr.ecr.us-east-2.amazonaws.com' }}/molecule-ai/platform-tenant
|
||||
|
||||
@@ -327,55 +327,6 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
# Build + push the CONCIERGE platform-agent image. Extends the LIVE
|
||||
# platform-tenant image (built just above) with the concierge identity
|
||||
# baked from the platform-agent template (staged at .tenant-bundle-deps/
|
||||
# workspace-configs-templates/platform-agent by the Pre-clone step from
|
||||
# the manifest platform-agent entry). MUST run AFTER the tenant build —
|
||||
# FROM ${TENANT_IMAGE_NAME}:${TAG_SHA}, the live workspace-server image
|
||||
# concierges already run (it has /entrypoint.sh = entrypoint-tenant.sh,
|
||||
# which the platform-agent wrapper chains to). The dead molecule-ai/platform
|
||||
# base (unbuilt since 2026-05-15) is deliberately NOT used. CP selects this
|
||||
# image for kind=platform (core#2495); without it the concierge boots with
|
||||
# no identity (#2919 image-bake / #2955 identity-fallback.sh).
|
||||
- name: Build & push platform-agent image to ECR (staging-<sha> + staging-latest)
|
||||
# Inputs for the script below: image names come from the workflow-level
# env, the tag suffix from the `tags` step output, and the AWS credentials
# from repository secrets.
env:
|
||||
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
|
||||
PLATFORM_AGENT_IMAGE_NAME: ${{ env.PLATFORM_AGENT_IMAGE_NAME }}
|
||||
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
|
||||
TAG_LATEST: staging-latest
|
||||
GIT_SHA: ${{ steps.tags.outputs.sha }}
|
||||
REPO: ${{ github.event.repository.name }}
|
||||
GITHUB_RUN_ID: ${{ github.run_id }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
# Script outline: log in to the ECR registry (the host part of
# PLATFORM_AGENT_IMAGE_NAME), create a per-run buildx builder, build
# Dockerfile.platform-agent with BASE_IMAGE set to the tenant image at
# TAG_SHA, push both the TAG_SHA and TAG_LATEST tags, then remove the
# builder. The builder is also removed when the build fails, before the
# step exits non-zero.
run: |
|
||||
set -euo pipefail
|
||||
ECR_REGISTRY="${PLATFORM_AGENT_IMAGE_NAME%%/*}"
|
||||
aws ecr get-login-password --region us-east-2 | \
|
||||
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
|
||||
builder="pa-builder-${GITHUB_RUN_ID}"
|
||||
docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true
|
||||
docker buildx build \
|
||||
--builder "${builder}" \
|
||||
--file ./workspace-server/Dockerfile.platform-agent \
|
||||
--build-arg BASE_IMAGE="${TENANT_IMAGE_NAME}:${TAG_SHA}" \
|
||||
--provenance=false \
|
||||
--sbom=false \
|
||||
--build-arg GIT_SHA="${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
|
||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
|
||||
--cache-from "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache" \
|
||||
--cache-to "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
|
||||
--tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}" \
|
||||
--tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_LATEST}" \
|
||||
--push . || { docker buildx rm "${builder}" >/dev/null 2>&1 || true; echo "::error::platform-agent image build failed"; exit 1; }
|
||||
docker buildx rm "${builder}" >/dev/null 2>&1 || true
|
||||
echo "::notice::platform-agent image pushed: ${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}"
|
||||
|
||||
# Staging auto-deploy: every workspace-server image publish on main should
|
||||
# roll out to the staging fleet so code fixes reach staging without a
|
||||
# manual workflow_dispatch. Gitea 1.22.6 does not support workflow_run, so
|
||||
|
||||
@@ -1,156 +0,0 @@
|
||||
# Platform-agent image variant (RFC #2843 §10a IMAGE-BAKED).
|
||||
#
|
||||
# The platform-agent image is the concierge's dedicated image. The base
|
||||
# platform image (Dockerfile) is the ordinary /platform image; the
|
||||
# platform-agent variant EXTENDS the base with the concierge's IDENTITY
|
||||
# baked in, sourced FROM the platform-agent TEMPLATE REPO
|
||||
# (molecule-ai/molecule-ai-workspace-template-platform-agent) — the
|
||||
# SAME SSOT that the asset-channel delivers post-#29-activation.
|
||||
#
|
||||
# Why a dedicated image: the concierge is a platform-managed agent
|
||||
# (NOT a user template) with a different threat model and a different
|
||||
# identity-delivery requirement. The asset channel (PR-B, #2900+#2903)
|
||||
# works in SaaS+token, in SaaS+public-fetch, and (post-#29-activation)
|
||||
# in any tenant. The IMAGE-BAKED variant covers the remaining gap:
|
||||
# self-hosted deployments (no MOLECULE_TEMPLATE_REPO_TOKEN) and the
|
||||
# pre-#29-activation bootstrap window, where neither the asset channel
|
||||
# nor the local template path is guaranteed to be available.
|
||||
#
|
||||
# SSOT contract (driver hard-requirement on the IMAGE-BAKED impl):
|
||||
# The image-baked content (config.yaml + prompts/concierge.md +
|
||||
# mcp_servers.yaml + identity-fallback.sh) is SOURCED FROM the
|
||||
# platform-agent TEMPLATE REPO, NOT vendored/duplicated in core. A CI
|
||||
# DRIFT-GATE
|
||||
# (workspace-server/internal/provisioner/platform_agent_image_drift_test.go,
|
||||
# pinned against /opt/molecule-platform-agent-template/{config.yaml,
|
||||
# mcp_servers.yaml,prompts/concierge.md,identity-fallback.sh} vs the
|
||||
# pre-cloned .tenant-bundle-deps/workspace-configs-templates/platform-
|
||||
# agent/ source) asserts byte-equal at build time. A future drift
|
||||
# would fail the drift-gate test (go test -run
|
||||
# TestPlatformAgentImageDriftGate), catching it BEFORE the image is
|
||||
# published — so image snapshot + template can NEVER diverge in
|
||||
# production without a CI-red signal.
|
||||
#
|
||||
# Build context: same as Dockerfile. The platform-agent template
|
||||
# content is pre-cloned by scripts/clone-manifest.sh into
|
||||
# .tenant-bundle-deps/workspace-configs-templates/platform-agent/
|
||||
# (the platform-agent template is a manifest.json workspace_templates
|
||||
# entry per RFC #2843 §10a), and this Dockerfile reads it via the
|
||||
# PLATFORM_AGENT_TEMPLATE_DIR build-arg (default = canonical CI path).
|
||||
#
|
||||
# Usage (operator / CI):
|
||||
# docker buildx build \
|
||||
# --build-arg PLATFORM_AGENT_TEMPLATE_DIR=.tenant-bundle-deps/workspace-configs-templates/platform-agent \
|
||||
#     --tag ${REGISTRY}/molecule-ai/molecule-platform-agent:staging-${GIT_SHA} \
|
||||
# -f workspace-server/Dockerfile.platform-agent \
|
||||
# .
|
||||
#
|
||||
# Runtime contract: a container started from this image has the
|
||||
# concierge identity at /opt/molecule-platform-agent-template/. The
|
||||
# pre-#29/self-host fallback path in the workspace-server's
|
||||
# applyConciergeProvisionConfig hook (workspace-server/internal/
|
||||
# handlers/platform_agent.go) reads from this path when the asset-
|
||||
# channel deliver is unavailable. Post-#29 activation, the asset
|
||||
# channel remains the SSOT-delivery path; the image-baked copy is
|
||||
# a last-resort fallback (intentionally NOT a parallel SSOT — the
|
||||
# drift-gate enforces single-SSOT).
|
||||
#
|
||||
# The identity-fallback.sh script is the WORKING fallback (the
|
||||
# IMAGE_BAKED_IDENTITY_PRESENT echo-only marker that the #2919 PR
|
||||
# shipped was a log line that did nothing — this PR's companion
|
||||
# template-platform-agent PR adds the real script). The platform-
|
||||
# agent entrypoint runs the script at boot, BEFORE handing off
|
||||
# to the base image's /entrypoint.sh. Fill-absent-only: a delivered
|
||||
# /configs/* (asset-channel SSOT) is NEVER overwritten; the image-
|
||||
# baked copy is the safety net for self-host + pre-#29-bootstrap
|
||||
# windows where neither the asset channel nor the local template
|
||||
# path is guaranteed to be available. See
|
||||
# template-platform-agent #2 (identity-fallback.sh) for the script
|
||||
# semantics — SRC=/opt/molecule-platform-agent-template, DST=/configs,
|
||||
# fail-soft on missing SRC.
|
||||
|
||||
# Base image this variant extends. The default tag below applies only when
# no --build-arg BASE_IMAGE is passed; the CI build step passes the tenant
# image at its staging-<sha> tag.
ARG BASE_IMAGE=molecule-local/platform:latest
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
# PLATFORM-AGENT TEMPLATE CONTENT — SSOT-sourced from the pre-cloned
|
||||
# template repo. The default path mirrors where scripts/clone-manifest.sh
|
||||
# places workspace_templates entries
|
||||
# (.tenant-bundle-deps/workspace-configs-templates/<name>/). The
|
||||
# platform-agent template is in manifest.json's workspace_templates
|
||||
# (per RFC #2843 §10a), so the existing pre-clone step in
|
||||
# publish-workspace-server-image.yml populates this path with no
|
||||
# additional CI work.
|
||||
#
|
||||
# The build-arg exists for operators / staging mirrors that pre-clone
|
||||
# to a different dir (e.g. a shallow --depth=1 mirror for fast CI).
|
||||
# Default value is the canonical CI path; override only when the
|
||||
# pre-clone layout differs.
|
||||
#
|
||||
# Why build-arg, not ENV: the path is a BUILD-TIME input, not a
|
||||
# runtime config; build-args are the right tool and the value never
|
||||
# enters the running container.
|
||||
ARG PLATFORM_AGENT_TEMPLATE_DIR=.tenant-bundle-deps/workspace-configs-templates/platform-agent
|
||||
# Bake the template's config.yaml, mcp_servers.yaml and prompts/ directory
# into the image under /opt/molecule-platform-agent-template/.
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/config.yaml /opt/molecule-platform-agent-template/config.yaml
|
||||
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/mcp_servers.yaml /opt/molecule-platform-agent-template/mcp_servers.yaml
|
||||
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/prompts/ /opt/molecule-platform-agent-template/prompts/
|
||||
# The boot-time identity-fallback script. Run at container start
|
||||
# (see /entrypoint-platform-agent.sh below) to fill ABSENT files at
|
||||
# /configs/ from the image-baked /opt path. The script is the SSOT in
|
||||
# the platform-agent TEMPLATE REPO — drift-gate
|
||||
# (platform_agent_image_drift_test.go) catches content drift between
|
||||
# this COPY source and the image-baked destination.
|
||||
#
|
||||
# RCA 12124 (DRIVER-ESCALATED live prod identity): the script MUST
|
||||
# write /configs/system-prompt.md (the file the
|
||||
# conciergeIdentityPresent probe at platform_agent.go:399 reads) —
|
||||
# NOT just /configs/prompts/concierge.md. Prior shape had a
|
||||
# conditional write (`if [ ! -s "$DST/system-prompt.md" ]`) which
|
||||
# could fail to fire after a partial-template run; the fixed script
|
||||
# in the template-platform-agent repo (PR-side, merged to template
|
||||
# main) is unconditional: always writes /configs/system-prompt.md
|
||||
# from prompts/concierge.md + {{CONCIERGE_NAME}} substitution.
|
||||
# COPY --chmod sets +x at copy time (buildx-native). A `RUN chmod` fails with
|
||||
# "Operation not permitted" when the base image runs as a non-root user — the
|
||||
# live platform-tenant base does, whereas the dead molecule-ai/platform base was
|
||||
# root, which masked this. --chmod works regardless of base USER.
|
||||
COPY --chmod=0755 ${PLATFORM_AGENT_TEMPLATE_DIR}/identity-fallback.sh /opt/molecule-platform-agent-template/identity-fallback.sh
|
||||
|
||||
# PLATFORM-AGENT ENTRYPOINT — runs identity-fallback.sh FIRST (fills
|
||||
# absent /configs/ files from the image-baked /opt path; the
|
||||
# asset-channel deliver is the SSOT post-#29-activation, this is the
|
||||
# self-host + pre-#29-bootstrap safety net), then hands off to the
|
||||
# base image's /entrypoint.sh (which does docker-socket group setup,
|
||||
# memory-plugin sidecar spawn-gate, then exec su-exec platform
|
||||
# /platform).
|
||||
#
|
||||
# Why a separate entrypoint (not extending /entrypoint.sh in core):
|
||||
# the IMAGE-BAKED identity-fallback is a platform-agent-specific
|
||||
# concern — the base /entrypoint.sh stays the single runtime entry
|
||||
# for the ordinary /platform image, and the platform-agent variant
|
||||
# overrides only the boot hook. The script is run as a child process
|
||||
# (not sourced), so a missing or failing script is logged and boot
|
||||
# continues (su-exec will still run /platform; the runtime's
|
||||
# MISSING_MODEL fail-closed surfaces the operator-visible error).
|
||||
COPY --chmod=0755 <<'ENTRY' /entrypoint-platform-agent.sh
|
||||
#!/bin/sh
|
||||
# /opt/molecule-platform-agent-template/identity-fallback.sh: per-
|
||||
# file copy of ABSENT files from the image-baked SSOT path to
|
||||
# /configs/. The asset-channel deliver (post-#29-activation) is the
|
||||
# authoritative path when it lands; this is the safety net for self-
|
||||
# host + pre-#29-bootstrap windows where neither the asset channel
|
||||
# nor the local template path is guaranteed to be available.
|
||||
if [ -x /opt/molecule-platform-agent-template/identity-fallback.sh ]; then
|
||||
/opt/molecule-platform-agent-template/identity-fallback.sh || {
|
||||
echo "platform-agent: ⚠️ identity-fallback.sh failed (see prior log lines); continuing boot — runtime will MISSING_MODEL fail-closed if /configs is empty" >&2
|
||||
}
|
||||
else
|
||||
echo "platform-agent: identity-fallback.sh not present (image built without it); skipping fallback (runtime will MISSING_MODEL fail-closed)" >&2
|
||||
fi
|
||||
# Hand off to the base image's entrypoint (docker-socket group
|
||||
# setup, memory-plugin sidecar spawn-gate, then su-exec platform
|
||||
# /platform). Pass through any CMD args (the platform-agent image
|
||||
# is invoked the same way as the base — operator/CI sets CMD as
|
||||
# needed; this entrypoint is transparent to the args).
|
||||
exec /entrypoint.sh "$@"
|
||||
ENTRY
|
||||
# Make the wrapper script the image entrypoint (exec form). The wrapper
# runs the identity fallback and then execs the base image's /entrypoint.sh
# with the original arguments.
ENTRYPOINT ["/entrypoint-platform-agent.sh"]
|
||||
Reference in New Issue
Block a user