From 9d4ab7b1a2a51f785a2d8552e4fba6f7cd9af15a Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Tue, 28 Apr 2026 13:46:39 -0700
Subject: [PATCH] =?UTF-8?q?feat(ci):=20auto-promote-on-e2e=20=E2=80=94=20r?=
 =?UTF-8?q?etag=20:latest=20on=20green=20E2E=20Staging=20SaaS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the final gap in the SaaS pipeline. After auto-promote-staging
fast-forwards main, publish-workspace-server-image builds new
`:staging-<sha>` images, but `:latest` (what prod tenants pull) only
moves on either a manual `promote-latest.yml` dispatch or a
canary-verify retag (gated on a Phase 2 fleet that doesn't exist).

This workflow closes that gap by retagging `platform:staging-<sha>` +
`platform-tenant:staging-<sha>` → `:latest` whenever E2E Staging SaaS
passes for a `main` push. Uses crane (no Docker daemon needed).
Verifies both images exist before retagging either, so a missing image
cannot leave `:latest` half-published.

Why trigger only on `main` (not staging):
  - `:latest` is what prod tenants pull. Only SHAs that have reached
    `main` (via auto-promote-staging) should advance `:latest`.
  - Triggering on staging would let a staging-only revert advance
    `:latest` to a SHA that never reaches `main`, breaking the
    invariant "production runs what's on `main`".

Why a separate workflow rather than folding into e2e-staging-saas.yml:
  - Test concerns and release concerns stay separate.
  - Disabling promote during an incident is one workflow toggle, not
    an edit to the long E2E file.
  - When Phase 2 canary work eventually lands, the canary path can
    replace this trigger without touching the E2E workflow.

Doc-aligned: per molecule-controlplane/docs/canary-tenants.md,
"green staging E2E → :latest" is the recommended approach for the
current scale (≤20 paying tenants); canary fleet is deferred until
blast radius grows.
Pipeline after this lands is fully self-healing:
  staging push → 4 gates green → auto-promote fast-forwards main
  → publish-workspace-server-image → E2E Staging SaaS
  → THIS WORKFLOW retags :latest
  → tenant fleet auto-pulls in 5 min (or redeploy-tenants-on-main
    fans out faster)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/auto-promote-on-e2e.yml | 133 ++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 .github/workflows/auto-promote-on-e2e.yml

diff --git a/.github/workflows/auto-promote-on-e2e.yml b/.github/workflows/auto-promote-on-e2e.yml
new file mode 100644
index 00000000..21f901e9
--- /dev/null
+++ b/.github/workflows/auto-promote-on-e2e.yml
@@ -0,0 +1,133 @@
+name: Auto-promote :latest on E2E green
+
+# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
+# → `:latest` whenever E2E Staging SaaS passes for a `main` push.
+#
+# This is the doc-aligned alternative to the (deferred) Phase 2 canary
+# fleet — staging E2E catches ~90% of what canary would catch at 0%
+# ongoing infra cost. See `molecule-controlplane/docs/canary-tenants.md`
+# section "Do we actually need canary right now?" — recommended
+# sequencing for the current scale (≤20 paying tenants).
+#
+# Why a separate workflow rather than folding into e2e-staging-saas.yml:
+#   - Keeps test concerns separate from release concerns.
+#   - Disabling promote (e.g. during an incident) is one toggle, not an
+#     edit to the long E2E workflow file.
+#   - When Phase 2 canary work eventually lands, the canary path can
+#     replace this file's trigger without touching the E2E workflow.
+#
+# Why trigger on `main` only:
+#   - `:latest` is what prod tenants pull. We only want SHAs that have
+#     reached `main` (via auto-promote-staging) to advance `:latest`.
+#   - Triggering on staging would let a staging-only revert advance
+#     `:latest` to a SHA that never reaches `main`, breaking the
+#     "production runs what's on `main`" invariant.
+
+on:
+  workflow_run:
+    workflows: ['E2E Staging SaaS (full lifecycle)']
+    types: [completed]
+    branches: [main]
+  workflow_dispatch:
+    inputs:
+      sha:
+        description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)'
+        required: false
+        type: string
+
+permissions:
+  contents: read
+  packages: write
+
+# Serialize promotions: overlapping runs could interleave their retags and
+# leave the two images' `:latest` pointing at different SHAs.
+concurrency:
+  group: auto-promote-latest
+  cancel-in-progress: false
+
+env:
+  IMAGE_NAME: ghcr.io/molecule-ai/platform
+  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
+
+jobs:
+  promote:
+    # Skip if E2E failed — `:latest` stays on the prior known-good
+    # digest. Manual dispatch always proceeds (the operator already
+    # decided to promote).
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Compute short sha
+        id: sha
+        # Event data reaches the script through env vars instead of being
+        # interpolated into it, so a crafted dispatch input cannot inject
+        # shell commands.
+        env:
+          INPUT_SHA: ${{ github.event.inputs.sha }}
+          UPSTREAM_SHA: ${{ github.event.workflow_run.head_sha }}
+        run: |
+          set -euo pipefail
+          # Manual override wins; otherwise use the SHA the upstream E2E ran on.
+          FULL="${INPUT_SHA:-${UPSTREAM_SHA:-}}"
+          # A manual dispatch without `sha` has no upstream run, so the value is
+          # empty; fail here with a clear message instead of at the tag lookup.
+          if ! [[ "${FULL}" =~ ^[0-9a-f]{7,40}$ ]]; then
+            echo "::error::Missing or malformed sha (need 7-40 lowercase hex chars); pass the 'sha' input on manual dispatch."
+            exit 1
+          fi
+          echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
+          echo "full=${FULL}" >> "$GITHUB_OUTPUT"
+
+      - uses: imjasonh/setup-crane@v0.4
+
+      - name: GHCR login
+        env:
+          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GHCR_USER: ${{ github.actor }}
+        run: |
+          set -euo pipefail
+          echo "${GHCR_TOKEN}" | \
+            crane auth login ghcr.io -u "${GHCR_USER}" --password-stdin
+
+      - name: Verify :staging- exists for both images
+        # Better to fail fast with a clear message than to half-tag
+        # (platform retagged but platform-tenant missing → tenants pull
+        # a stale image).
+        run: |
+          set -euo pipefail
+          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
+            tag="${img}:staging-${{ steps.sha.outputs.short }}"
+            if ! crane manifest "$tag" >/dev/null 2>&1; then
+              echo "::error::Missing tag: $tag"
+              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote-on-e2e can retag :latest."
+              exit 1
+            fi
+            echo " ok: $tag exists"
+          done
+
+      - name: Retag platform :staging- → :latest
+        run: |
+          crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
+
+      - name: Retag tenant :staging- → :latest
+        run: |
+          crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
+
+      - name: Summary
+        run: |
+          {
+            echo "## E2E green → :latest promoted"
+            echo
+            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+              echo "- Trigger: manual dispatch"
+            else
+              echo "- Upstream E2E run: ${{ github.event.workflow_run.html_url }}"
+            fi
+            echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest"
+            echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest"
+            echo
+            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
+            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
+          } >> "$GITHUB_STEP_SUMMARY"