ci(ecr): auto-apply canonical image lifecycle policy on prod ECR pushes #3137
@@ -231,6 +231,21 @@ jobs:
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)
|
||||
|
||||
# Keep the prod ECR storage bill bounded: declare the canonical image
|
||||
# lifecycle policy on the canvas repo we just pushed. Idempotent +
|
||||
# fail-soft (never fails the publish). SSOT JSON lives in the script.
|
||||
- name: Ensure ECR lifecycle policy (canvas)
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
run: |
|
||||
set -uo pipefail
|
||||
# Strip the registry host; pass the bare molecule-ai/<repo> name.
|
||||
REPO_NAME="${IMAGE_NAME#*/}"
|
||||
bash scripts/ops/ensure-ecr-lifecycle.sh "${REPO_NAME}"
|
||||
|
||||
# bp-exempt: post-merge canvas promote side-effect; merge is gated by CI /
|
||||
# all-required and this job waits for green push CI on the SHA before acting.
|
||||
promote-canvas:
|
||||
|
||||
@@ -255,6 +255,22 @@ jobs:
|
||||
--tag "${IMAGE_NAME}:${TAG_LATEST}" \
|
||||
--push .
|
||||
|
||||
# Keep the prod ECR storage bill bounded: declare the canonical image
|
||||
# lifecycle policy on the repo we just pushed. Idempotent + fail-soft
|
||||
# (never fails the publish). SSOT JSON lives in the script. The platform
|
||||
# build above is continue-on-error (dead base), so this runs regardless.
|
||||
- name: Ensure ECR lifecycle policy (platform)
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
run: |
|
||||
set -uo pipefail
|
||||
# Strip the registry host; pass the bare molecule-ai/<repo> name.
|
||||
REPO_NAME="${IMAGE_NAME#*/}"
|
||||
bash scripts/ops/ensure-ecr-lifecycle.sh "${REPO_NAME}"
|
||||
|
||||
# Build + push tenant image (Go platform + Next.js canvas in one image).
|
||||
# Push the same build to the staging account too so fresh staging/E2E
|
||||
# tenants can pull without cross-account ECR reads. The staging ECR repo
|
||||
@@ -604,6 +620,22 @@ jobs:
|
||||
docker push "${ref}"
|
||||
done
|
||||
|
||||
# Keep the prod ECR storage bill bounded: declare the canonical image
|
||||
# lifecycle policy on the prod tenant repo we just pushed. platform-tenant
|
||||
# is the biggest accumulator (70+ images / 12GB+). Idempotent + fail-soft.
|
||||
# Only the PROD repo is targeted here (staging ECR has its own account).
|
||||
- name: Ensure ECR lifecycle policy (platform-tenant)
|
||||
env:
|
||||
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
run: |
|
||||
set -uo pipefail
|
||||
# Strip the registry host; pass the bare molecule-ai/<repo> name.
|
||||
REPO_NAME="${TENANT_IMAGE_NAME#*/}"
|
||||
bash scripts/ops/ensure-ecr-lifecycle.sh "${REPO_NAME}"
|
||||
|
||||
# Staging auto-deploy: every workspace-server image publish on main should
|
||||
# roll out to the staging fleet so code fixes reach staging without a
|
||||
# manual workflow_dispatch. Gitea 1.22.6 does not support workflow_run, so
|
||||
|
||||
Executable
+87
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env bash
|
||||
# ensure-ecr-lifecycle.sh — idempotently apply the canonical ECR image
|
||||
# lifecycle policy to a prod ECR repository, called from the publish
|
||||
# pipelines right after they push an image.
|
||||
#
|
||||
# Why this exists: the prod ECR repos under
|
||||
# 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/* had their
|
||||
# lifecycle policies set out-of-band (no IaC managed them), so the prod
|
||||
# ECR storage bill (~$56/mo, account 153263036946) kept growing —
|
||||
# platform-tenant alone accumulated 70+ images / 12GB+. Every untagged
|
||||
# layer and every superseded :sha-<...> tag lingered forever.
|
||||
#
|
||||
# The durable fix: the publish workflows ALREADY authenticate to prod ECR
|
||||
# and push images, so they already hold the right creds + region. This
|
||||
# script just adds a `put-lifecycle-policy` call after each push. ECR's
|
||||
# own lifecycle engine then expires old images on its schedule — no
|
||||
# deletes happen here, this only DECLARES the policy. Re-applying the
|
||||
# same policy on every build keeps it in lockstep with this file (IaC),
|
||||
# so an out-of-band edit is corrected on the next publish.
|
||||
#
|
||||
# SSOT: the canonical policy JSON below is the single source of truth.
|
||||
# It is intentionally duplicated byte-for-byte into the equivalent script
|
||||
# in each repo whose publish workflow pushes to prod ECR (a workflow can
|
||||
# only call a script in its own checkout); keep them identical. The policy
|
||||
# was validated on the operator account before rollout.
|
||||
#
|
||||
# Policy:
|
||||
# rule 1 — expire untagged images 1 day after push (build cache churn,
|
||||
# orphaned layers from re-pushed tags)
|
||||
# rule 2 — keep only the last 10 tagged images for the sha-/v/latest/
|
||||
# staging/main tag families (per-prefix retention; ECR keeps
|
||||
# the N most-recent by push time and expires older)
|
||||
#
|
||||
# Fail-soft by design: a publish MUST NOT fail because policy application
|
||||
# errored (e.g. transient ECR API blip, IAM gap). On any error this logs
|
||||
# a warning and exits 0. The policy is reapplied on the next publish.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/ops/ensure-ecr-lifecycle.sh <repository-name>
|
||||
# e.g. scripts/ops/ensure-ecr-lifecycle.sh molecule-ai/platform-tenant
|
||||
#
|
||||
# Env (all optional — sane defaults match the publish workflows):
|
||||
# AWS_REGION / AWS_DEFAULT_REGION — ECR region (default: us-east-2)
|
||||
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY — provided by the publish step
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — policy applied, already current, or fail-soft no-op (always 0)
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
REPO="${1:-}"
|
||||
REGION="${AWS_REGION:-${AWS_DEFAULT_REGION:-us-east-2}}"
|
||||
|
||||
if [ -z "${REPO}" ]; then
|
||||
echo "::warning::ensure-ecr-lifecycle: no repository name given; skipping" >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Canonical lifecycle policy (SSOT) -------------------------------------
|
||||
# Keep this JSON identical across every repo's copy of this script.
|
||||
read -r -d '' LIFECYCLE_POLICY <<'JSON' || true
|
||||
{"rules":[
|
||||
{"rulePriority":1,"description":"Expire untagged after 1 day","selection":{"tagStatus":"untagged","countType":"sinceImagePushed","countUnit":"days","countNumber":1},"action":{"type":"expire"}},
|
||||
{"rulePriority":2,"description":"Keep last 10 tagged","selection":{"tagStatus":"tagged","tagPrefixList":["sha-","v","latest","staging","main"],"countType":"imageCountMoreThan","countNumber":10},"action":{"type":"expire"}}
|
||||
]}
|
||||
JSON
|
||||
|
||||
if ! command -v aws >/dev/null 2>&1; then
|
||||
echo "::warning::ensure-ecr-lifecycle: aws CLI not found; skipping policy for ${REPO}" >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "::notice::ensure-ecr-lifecycle: applying canonical lifecycle policy to ${REPO} (region ${REGION})"
|
||||
|
||||
if aws ecr put-lifecycle-policy \
|
||||
--repository-name "${REPO}" \
|
||||
--region "${REGION}" \
|
||||
--lifecycle-policy-text "${LIFECYCLE_POLICY}" >/dev/null 2>/tmp/ecr-lifecycle-err.$$; then
|
||||
echo "::notice::ensure-ecr-lifecycle: policy applied to ${REPO}"
|
||||
else
|
||||
echo "::warning::ensure-ecr-lifecycle: put-lifecycle-policy failed for ${REPO} (non-fatal — policy reapplies next publish)" >&2
|
||||
sed 's/^/::warning::ensure-ecr-lifecycle: /' /tmp/ecr-lifecycle-err.$$ >&2 2>/dev/null || true
|
||||
fi
|
||||
rm -f /tmp/ecr-lifecycle-err.$$ 2>/dev/null || true
|
||||
|
||||
# Always succeed — never break a publish on lifecycle-policy errors.
|
||||
exit 0
|
||||
Reference in New Issue
Block a user