From 5a54d34c75e67473b90fb78c6d7c38172cb7b80c Mon Sep 17 00:00:00 2001 From: core-devops Date: Sun, 21 Jun 2026 18:12:26 -0700 Subject: [PATCH] ci(ecr): auto-apply canonical image lifecycle policy on prod ECR pushes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prod ECR repos under 153263036946.dkr.ecr.us-east-2.amazonaws.com/ molecule-ai/* had lifecycle policies set out-of-band (no IaC), so the prod ECR storage bill (~$56/mo) kept growing — platform-tenant alone accumulated 70+ images / 12GB+ of untagged + superseded sha- tags. Durable fix: the publish workflows already authenticate to prod ECR and push images (right creds + region), so apply the lifecycle policy right after each push. ECR's lifecycle engine then expires old images on its own schedule — this only DECLARES policy, no deletes happen here. - scripts/ops/ensure-ecr-lifecycle.sh: shared, idempotent, fail-soft helper (always exit 0 so a policy error never breaks a publish). The canonical policy JSON is SSOT in this one file: expire untagged after 1 day; keep last 10 tagged for sha-/v/latest/staging/main prefixes. - publish-workspace-server-image.yml: apply to molecule-ai/platform (after platform push) + molecule-ai/platform-tenant (after tenant push). - publish-canvas-image.yml: apply to molecule-ai/canvas after push. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/publish-canvas-image.yml | 15 ++++ .../publish-workspace-server-image.yml | 32 +++++++ scripts/ops/ensure-ecr-lifecycle.sh | 87 +++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100755 scripts/ops/ensure-ecr-lifecycle.sh diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 5bb7f2bc..299bf66b 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -231,6 +231,21 @@ jobs: org.opencontainers.image.revision=${{ github.sha }} org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow) + # Keep the prod ECR storage bill bounded: declare the canonical image + # lifecycle policy on the canvas repo we just pushed. Idempotent + + # fail-soft (never fails the publish). SSOT JSON lives in the script. + - name: Ensure ECR lifecycle policy (canvas) + env: + IMAGE_NAME: ${{ env.IMAGE_NAME }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + run: | + set -uo pipefail + # Strip the registry host; pass the bare molecule-ai/<repo> name. + REPO_NAME="${IMAGE_NAME#*/}" + bash scripts/ops/ensure-ecr-lifecycle.sh "${REPO_NAME}" + # bp-exempt: post-merge canvas promote side-effect; merge is gated by CI / # all-required and this job waits for green push CI on the SHA before acting. promote-canvas: diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 6fa3c748..ad3a3436 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -255,6 +255,22 @@ jobs: --tag "${IMAGE_NAME}:${TAG_LATEST}" \ --push . + # Keep the prod ECR storage bill bounded: declare the canonical image + # lifecycle policy on the repo we just pushed. 
Idempotent + fail-soft + # (never fails the publish). SSOT JSON lives in the script. The platform + # build above is continue-on-error (dead base), so this runs regardless. + - name: Ensure ECR lifecycle policy (platform) + env: + IMAGE_NAME: ${{ env.IMAGE_NAME }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + run: | + set -uo pipefail + # Strip the registry host; pass the bare molecule-ai/<repo> name. + REPO_NAME="${IMAGE_NAME#*/}" + bash scripts/ops/ensure-ecr-lifecycle.sh "${REPO_NAME}" + # Build + push tenant image (Go platform + Next.js canvas in one image). # Push the same build to the staging account too so fresh staging/E2E # tenants can pull without cross-account ECR reads. The staging ECR repo @@ -604,6 +620,22 @@ jobs: docker push "${ref}" done + # Keep the prod ECR storage bill bounded: declare the canonical image + # lifecycle policy on the prod tenant repo we just pushed. platform-tenant + # is the biggest accumulator (70+ images / 12GB+). Idempotent + fail-soft. + # Only the PROD repo is targeted here (staging ECR has its own account). + - name: Ensure ECR lifecycle policy (platform-tenant) + env: + TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + run: | + set -uo pipefail + # Strip the registry host; pass the bare molecule-ai/<repo> name. + REPO_NAME="${TENANT_IMAGE_NAME#*/}" + bash scripts/ops/ensure-ecr-lifecycle.sh "${REPO_NAME}" + # Staging auto-deploy: every workspace-server image publish on main should # roll out to the staging fleet so code fixes reach staging without a # manual workflow_dispatch. 
Gitea 1.22.6 does not support workflow_run, so
diff --git a/scripts/ops/ensure-ecr-lifecycle.sh b/scripts/ops/ensure-ecr-lifecycle.sh
new file mode 100755
index 00000000..f1a7815a
--- /dev/null
+++ b/scripts/ops/ensure-ecr-lifecycle.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+# ensure-ecr-lifecycle.sh — idempotently apply the canonical ECR image
+# lifecycle policy to a prod ECR repository, called from the publish
+# pipelines right after they push an image.
+#
+# Why this exists: the prod ECR repos under
+# 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/* had their
+# lifecycle policies set out-of-band (no IaC managed them), so the prod
+# ECR storage bill (~$56/mo, account 153263036946) kept growing —
+# platform-tenant alone accumulated 70+ images / 12GB+. Every untagged
+# layer and every superseded :sha-<...> tag lingered forever.
+#
+# The durable fix: the publish workflows ALREADY authenticate to prod ECR
+# and push images, so they already hold the right creds + region. This
+# script just adds a `put-lifecycle-policy` call after each push. ECR's
+# own lifecycle engine then expires old images on its schedule — no
+# deletes happen here, this only DECLARES the policy. Re-applying the
+# same policy on every build keeps it in lockstep with this file (IaC),
+# so an out-of-band edit is corrected on the next publish.
+#
+# SSOT: the canonical policy JSON below is the single source of truth.
+# It is intentionally duplicated byte-for-byte into the equivalent script
+# in each repo whose publish workflow pushes to prod ECR (a workflow can
+# only call a script in its own checkout); keep them identical.
+# NOTE(review): re-validate the JSON on the operator account (e.g. with
+# `aws ecr start-lifecycle-policy-preview`) whenever the rules change.
+#
+# Policy:
+#   rule 1    — expire untagged images 1 day after push (build cache
+#               churn, orphaned layers from re-pushed tags)
+#   rules 2-6 — keep only the last 10 tagged images for EACH of the
+#               latest/staging/main/v/sha- tag families (ECR keeps the
+#               N most-recent by push time and expires older).
+#
+# Why one rule per prefix: ECR ANDs the entries of a single
+# tagPrefixList — an image is selected only if it carries a tag for
+# EVERY listed prefix — so one combined five-prefix rule would select
+# almost nothing and sha- tags would pile up forever. The moving tags
+# are ranked ahead of sha- because an image that matches a rule's tags
+# cannot be expired by a lower-priority rule; an image still holding
+# e.g. `staging` is therefore never expired by the sha- rule.
+#
+# Fail-soft by design: a publish MUST NOT fail because policy application
+# errored (e.g. transient ECR API blip, IAM gap). On any error this logs
+# a warning and exits 0. The policy is reapplied on the next publish.
+#
+# Usage:
+#   scripts/ops/ensure-ecr-lifecycle.sh <repository-name>
+#   e.g. scripts/ops/ensure-ecr-lifecycle.sh molecule-ai/platform-tenant
+#
+# Env (all optional — sane defaults match the publish workflows):
+#   AWS_REGION / AWS_DEFAULT_REGION — ECR region (default: us-east-2)
+#   AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY — provided by the publish step
+#
+# Exit codes:
+#   0 — policy applied, already current, or fail-soft no-op (always 0)
+
+set -uo pipefail
+
+REPO="${1:-}"
+REGION="${AWS_REGION:-${AWS_DEFAULT_REGION:-us-east-2}}"
+
+if [[ -z "${REPO}" ]]; then
+  echo "::warning::ensure-ecr-lifecycle: no repository name given; skipping" >&2
+  exit 0
+fi
+
+# --- Canonical lifecycle policy (SSOT) -------------------------------------
+# Keep this JSON identical across every repo's copy of this script.
+# `read -d ''` hits EOF and returns non-zero by design, hence `|| true`.
+read -r -d '' LIFECYCLE_POLICY <<'JSON' || true
+{"rules":[
+  {"rulePriority":1,"description":"Expire untagged after 1 day","selection":{"tagStatus":"untagged","countType":"sinceImagePushed","countUnit":"days","countNumber":1},"action":{"type":"expire"}},
+  {"rulePriority":2,"description":"Keep last 10 latest-prefixed","selection":{"tagStatus":"tagged","tagPrefixList":["latest"],"countType":"imageCountMoreThan","countNumber":10},"action":{"type":"expire"}},
+  {"rulePriority":3,"description":"Keep last 10 staging-prefixed","selection":{"tagStatus":"tagged","tagPrefixList":["staging"],"countType":"imageCountMoreThan","countNumber":10},"action":{"type":"expire"}},
+  {"rulePriority":4,"description":"Keep last 10 main-prefixed","selection":{"tagStatus":"tagged","tagPrefixList":["main"],"countType":"imageCountMoreThan","countNumber":10},"action":{"type":"expire"}},
+  {"rulePriority":5,"description":"Keep last 10 v-prefixed","selection":{"tagStatus":"tagged","tagPrefixList":["v"],"countType":"imageCountMoreThan","countNumber":10},"action":{"type":"expire"}},
+  {"rulePriority":6,"description":"Keep last 10 sha-prefixed","selection":{"tagStatus":"tagged","tagPrefixList":["sha-"],"countType":"imageCountMoreThan","countNumber":10},"action":{"type":"expire"}}
+]}
+JSON
+
+if ! command -v aws >/dev/null 2>&1; then
+  echo "::warning::ensure-ecr-lifecycle: aws CLI not found; skipping policy for ${REPO}" >&2
+  exit 0
+fi
+
+echo "::notice::ensure-ecr-lifecycle: applying canonical lifecycle policy to ${REPO} (region ${REGION})"
+
+# Capture the CLI's stderr in a variable rather than a predictable
+# /tmp/<name>.$$ file: nothing to clean up, and no symlink/clobber race on
+# a shared runner. `2>&1 >/dev/null` routes stderr into the capture and
+# discards stdout (the API response).
+if ERR_OUTPUT="$(aws ecr put-lifecycle-policy \
+  --repository-name "${REPO}" \
+  --region "${REGION}" \
+  --lifecycle-policy-text "${LIFECYCLE_POLICY}" 2>&1 >/dev/null)"; then
+  echo "::notice::ensure-ecr-lifecycle: policy applied to ${REPO}"
+else
+  echo "::warning::ensure-ecr-lifecycle: put-lifecycle-policy failed for ${REPO} (non-fatal — policy reapplies next publish)" >&2
+  if [[ -n "${ERR_OUTPUT}" ]]; then
+    # Prefix every stderr line so it carries the same ::warning:: marker.
+    while IFS= read -r err_line; do
+      printf '::warning::ensure-ecr-lifecycle: %s\n' "${err_line}" >&2
+    done <<<"${ERR_OUTPUT}"
+  fi
+fi
+
+# Always succeed — never break a publish on lifecycle-policy errors.
+exit 0
-- 
2.52.0