fix(image): build platform-agent FROM live platform-tenant base (unblocks concierge identity) #2982

Merged
core-devops merged 1 commit from fix/platform-agent-base-on-tenant into main 2026-06-16 02:22:30 +00:00
@@ -218,7 +218,14 @@ jobs:
# - cache-from on a missing tag (first run) is a warning, not an error.
# - Concurrent publishes overwrite :buildcache last-writer-wins —
# same best-effort semantics as :staging-latest.
# NOTE: molecule-ai/platform is now ORPHANED — the concierge image was
# repointed to FROM platform-tenant (the live workspace-server image), and
# nothing else consumes molecule-ai/platform (it has not built since
# 2026-05-15). continue-on-error so a failure of this dead base build can
# never block the tenant + platform-agent builds below. Remove this step
# entirely in a follow-up once confirmed no consumer remains.
- name: Build & push platform image to ECR (staging-<sha> + staging-latest)
continue-on-error: true
env:
IMAGE_NAME: ${{ env.IMAGE_NAME }}
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
@@ -248,44 +255,6 @@ jobs:
--tag "${IMAGE_NAME}:${TAG_LATEST}" \
--push .
# Build + push the CONCIERGE platform-agent image. Extends the base
# platform image (just built above, passed via BASE_IMAGE) with the
# concierge identity baked from the platform-agent template (staged at
# .tenant-bundle-deps/workspace-configs-templates/platform-agent by the
# Pre-clone step from the manifest platform-agent entry). MUST run AFTER
# the base platform build (FROM ${IMAGE_NAME}:${TAG_SHA}). The CP selects
# this image for kind=platform (core#2495); without it the concierge boots
# with no identity (#2919 image-bake / #2955 identity-fallback.sh).
# NOTE(review): the hunk header just before this step (-248,44 +255,6) suggests
# this is the pre-change (removed) variant of the step, built FROM the
# ${IMAGE_NAME} base image — confirm against the raw diff.
- name: Build & push platform-agent image to ECR (staging-<sha> + staging-latest)
# Values handed to the script below as environment variables.
# NOTE(review): no AWS_* credentials are set here; the `aws` call below
# presumably relies on credentials configured by an earlier step — confirm.
env:
# Repository of the base image; joined with TAG_SHA to form BASE_IMAGE.
IMAGE_NAME: ${{ env.IMAGE_NAME }}
# Repository the platform-agent image (and its :buildcache tag) is pushed to.
PLATFORM_AGENT_IMAGE_NAME: ${{ env.PLATFORM_AGENT_IMAGE_NAME }}
# Immutable per-commit tag and the moving "latest staging" tag.
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
TAG_LATEST: staging-latest
# Commit SHA: passed as a build-arg and recorded in the OCI revision label.
GIT_SHA: ${{ steps.tags.outputs.sha }}
# Repository name, used only to build the OCI source-URL label.
REPO: ${{ github.event.repository.name }}
# Workflow run id, recorded in the molecule.workflow.run_id label.
GITHUB_RUN_ID: ${{ github.run_id }}
run: |
# Abort on any failing command, unset variable, or failing pipeline stage.
set -euo pipefail
# Registry host = everything before the first "/" of the image repository.
ECR_REGISTRY="${PLATFORM_AGENT_IMAGE_NAME%%/*}"
# Authenticate docker against that registry with a short-lived ECR password.
aws ecr get-login-password --region us-east-2 | \
docker login --username AWS --password-stdin "${ECR_REGISTRY}"
# Build with the repo root (".") as context and push both tags in one go.
# - BASE_IMAGE pins the Dockerfile's base to this run's staging-<sha> tag of
#   ${IMAGE_NAME}.
# - --provenance=false / --sbom=false switch off buildx attestations
#   (presumably so a plain image manifest is pushed — confirm).
# - Layer cache is read from and written to the :buildcache tag of the
#   platform-agent repository; ignore-error=true on --cache-to keeps a failed
#   cache export from failing the build.
# (Comments must stay above the command: a comment line inside the
# backslash-continued argument list would terminate it.)
docker buildx build \
--file ./workspace-server/Dockerfile.platform-agent \
--build-arg BASE_IMAGE="${IMAGE_NAME}:${TAG_SHA}" \
--provenance=false \
--sbom=false \
--build-arg GIT_SHA="${GIT_SHA}" \
--label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
--label "org.opencontainers.image.revision=${GIT_SHA}" \
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
--cache-from "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache" \
--cache-to "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
--tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}" \
--tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_LATEST}" \
--push .
# Build + push tenant image (Go platform + Next.js canvas in one image).
# Push the same build to the staging account too so fresh staging/E2E
# tenants can pull without cross-account ECR reads. The staging ECR repo
@@ -358,6 +327,55 @@ jobs:
fi
done
# Build + push the CONCIERGE platform-agent image. Extends the LIVE
# platform-tenant image (built just above) with the concierge identity
# baked from the platform-agent template (staged at .tenant-bundle-deps/
# workspace-configs-templates/platform-agent by the Pre-clone step from
# the manifest platform-agent entry). MUST run AFTER the tenant build —
# FROM ${TENANT_IMAGE_NAME}:${TAG_SHA}, the live workspace-server image
# concierges already run (it has /entrypoint.sh = entrypoint-tenant.sh,
# which the platform-agent wrapper chains to). The dead molecule-ai/platform
# base (unbuilt since 2026-05-15) is deliberately NOT used. CP selects this
# image for kind=platform (core#2495); without it the concierge boots with
# no identity (#2919 image-bake / #2955 identity-fallback.sh).
#
# The build runs on a dedicated, per-run buildx builder
# (pa-builder-<run_id>) that is removed again when the script exits.
- name: Build & push platform-agent image to ECR (staging-<sha> + staging-latest)
  env:
    # Repository of the tenant image used as BASE_IMAGE (tagged TAG_SHA).
    TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
    # Repository the platform-agent image and its :buildcache tag go to.
    PLATFORM_AGENT_IMAGE_NAME: ${{ env.PLATFORM_AGENT_IMAGE_NAME }}
    # Immutable per-commit tag and the moving "latest staging" tag.
    TAG_SHA: staging-${{ steps.tags.outputs.sha }}
    TAG_LATEST: staging-latest
    # Commit SHA: build-arg + OCI revision label.
    GIT_SHA: ${{ steps.tags.outputs.sha }}
    # Repository name, used only for the OCI source-URL label.
    REPO: ${{ github.event.repository.name }}
    # Workflow run id: names the throwaway builder and is stored as a label.
    GITHUB_RUN_ID: ${{ github.run_id }}
    # Credentials and region for the `aws ecr get-login-password` call.
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    AWS_DEFAULT_REGION: us-east-2
  run: |
    # Abort on any failing command, unset variable, or failing pipeline stage.
    set -euo pipefail

    # Registry host = everything before the first "/" of the image repository.
    ECR_REGISTRY="${PLATFORM_AGENT_IMAGE_NAME%%/*}"
    # Region comes from AWS_DEFAULT_REGION (declared in env above) so it is
    # defined in exactly one place.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" | \
      docker login --username AWS --password-stdin "${ECR_REGISTRY}"

    builder="pa-builder-${GITHUB_RUN_ID}"
    # Remove the per-run builder on EVERY exit path (success, build failure,
    # or any other command tripping `set -e`) instead of repeating the
    # removal by hand on each branch. Best-effort: a failed removal must not
    # change the step's exit status.
    cleanup_builder() {
      docker buildx rm "${builder}" >/dev/null 2>&1 || true
    }
    trap cleanup_builder EXIT

    # Best-effort create: a failure here is tolerated on purpose (presumably
    # so a builder left over under the same name can be reused — confirm).
    # If no usable builder exists, the build below fails and reports it.
    docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true

    # Build with the repo root as context and push both tags in one go.
    # Attestations (provenance/SBOM) are switched off; the layer cache is
    # read from and written to the :buildcache tag, and ignore-error=true
    # keeps a failed cache export from failing the build.
    if ! docker buildx build \
      --builder "${builder}" \
      --file ./workspace-server/Dockerfile.platform-agent \
      --build-arg BASE_IMAGE="${TENANT_IMAGE_NAME}:${TAG_SHA}" \
      --provenance=false \
      --sbom=false \
      --build-arg GIT_SHA="${GIT_SHA}" \
      --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
      --label "org.opencontainers.image.revision=${GIT_SHA}" \
      --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
      --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
      --cache-from "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache" \
      --cache-to "type=registry,ref=${PLATFORM_AGENT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
      --tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}" \
      --tag "${PLATFORM_AGENT_IMAGE_NAME}:${TAG_LATEST}" \
      --push .; then
      # Surface the failure as a workflow annotation; the EXIT trap removes
      # the builder.
      echo "::error::platform-agent image build failed"
      exit 1
    fi

    echo "::notice::platform-agent image pushed: ${PLATFORM_AGENT_IMAGE_NAME}:${TAG_SHA}"
# Staging auto-deploy: every workspace-server image publish on main should
# roll out to the staging fleet so code fixes reach staging without a
# manual workflow_dispatch. Gitea 1.22.6 does not support workflow_run, so