name: Auto-promote :latest after main image build

# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
# → `:latest` after either the image build or E2E completes on a `main`
# push, gated on E2E Staging SaaS not being red for that SHA.
#
# Why two triggers:
#
# `publish-workspace-server-image` and `e2e-staging-saas` are both
# paths-filtered, but with DIFFERENT path sets:
#
#   publish-workspace-server-image:
#     workspace-server/**, canvas/**, manifest.json
#
#   e2e-staging-saas (full lifecycle):
#     workspace-server/internal/handlers/{registry,workspace_provision,
#     a2a_proxy}.go, workspace-server/internal/middleware/**,
#     workspace-server/internal/provisioner/**, tests/e2e/test_staging_full_saas.sh
#
# The E2E set is a strict SUBSET of the publish set. So:
#   - canvas/** changes               → publish fires, E2E does not
#   - workspace-server/cmd/** changes → publish fires, E2E does not
#   - workspace-server/internal/sweep/** → publish fires, E2E does not
#
# NOTE(review): as listed above, tests/e2e/test_staging_full_saas.sh is in
# the E2E set but NOT in the publish set, so "strict subset" only holds for
# the workspace-server paths. A change touching only that script would fire
# E2E without a publish; the "Verify" step below would then fail on the
# missing `:staging-<sha>` tag. Confirm against the two workflows.
#
# The previous version triggered ONLY on E2E completion, which meant
# non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image
# but never advanced `:latest`. Result: as of 2026-04-28 this workflow
# had run zero times since merge despite eight main pushes — `:latest`
# was ~7 hours / 9 PRs behind main with no human realising. See
# `molecule-core` Slack discussion 2026-04-28.
#
# Adding `publish-workspace-server-image` as a second trigger closes
# the gap: any image rebuild on main eligibly advances `:latest`.
#
# Why E2E remains a kill-switch (not the trigger):
#
# When E2E DID run for this SHA and ended red, we abort — `:latest`
# stays on the prior known-good digest. When E2E didn't run (paths
# filtered out), we proceed: pre-merge gates already validated this
# SHA on staging via auto-promote-staging requiring CI + E2E Canvas +
# E2E API + CodeQL all green. Image content for non-E2E-paths
# (canvas, cmd, sweep) is exercised by those staging gates.
#
# Why `main` only:
#
# `:latest` is what prod tenants pull. We only want SHAs that have
# reached main (via auto-promote-staging) to advance `:latest`.
# Triggering on staging would let a staging-only revert advance
# `:latest` to a SHA that never reaches main, breaking the "production
# runs what's on main" invariant.
#
# Idempotency:
#
# When a SHA touches paths that match BOTH publish and E2E, both
# workflows fire and complete. Both trigger this workflow on
# completion → two runs race. Both retag `:staging-<sha>` →
# `:latest`. crane tag is idempotent (re-tagging the same digest is a
# no-op), so the second run is harmless. concurrency group serializes
# them anyway.

on:
  workflow_run:
    workflows:
      - 'E2E Staging SaaS (full lifecycle)'
      - 'publish-workspace-server-image'
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      # On manual dispatch there is no upstream workflow_run, so an empty
      # value leaves nothing to promote: the "Compute short sha" step
      # rejects it with an explicit error.
      sha:
        description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)'
        required: false
        type: string

permissions:
  # `actions: read` backs the gate step's `gh run list` call (Actions API).
  # A failed lookup there falls back to "none/none" → proceed, so a token
  # that cannot read workflow runs would quietly disable the E2E
  # kill-switch.
  actions: read
  contents: read
  packages: write

concurrency:
  # Serialize promotes per-SHA so the publish+E2E both-fired race lands
  # cleanly. Different SHAs can promote in parallel.
  group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }}
  cancel-in-progress: false

env:
  IMAGE_NAME: ghcr.io/molecule-ai/platform
  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant

jobs:
  promote:
    # Proceed if upstream succeeded OR manual dispatch. Upstream-failure
    # paths are filtered here; the E2E-was-red kill-switch lives in the
    # gate-check step below (covers the case where upstream is publish
    # success but E2E for the same SHA failed).
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    steps:
      - name: Compute short sha
        # Resolves the candidate commit: the manual `sha` input wins,
        # otherwise the upstream workflow_run head_sha. Emits:
        #   short — first 7 characters, used to build the `:staging-<sha>` tag
        #   full  — the value as given (only 7+ characters when an operator
        #           passed a short sha on manual dispatch)
        #
        # Event data reaches the script through `env:` rather than being
        # expanded into the script text, so an input value can never be
        # parsed as shell. An empty or non-hex value fails here with a
        # clear message instead of surfacing later as a bogus
        # "Missing tag: …:staging-" error.
        id: sha
        env:
          DISPATCH_SHA: ${{ github.event.inputs.sha }}
          UPSTREAM_SHA: ${{ github.event.workflow_run.head_sha }}
        run: |
          set -euo pipefail
          if [ -n "$DISPATCH_SHA" ]; then
            FULL="$DISPATCH_SHA"
          else
            FULL="$UPSTREAM_SHA"
          fi
          # 7–64 hex chars covers abbreviated and full SHA-1 / SHA-256 ids.
          if [[ ! "$FULL" =~ ^[0-9a-fA-F]{7,64}$ ]]; then
            echo "::error::No usable commit sha to promote (empty or not 7-64 hex characters). On manual dispatch, set the 'sha' input."
            exit 1
          fi
          echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
          echo "full=${FULL}" >> "$GITHUB_OUTPUT"

      - name: Gate — E2E Staging SaaS state for this SHA
        # When upstream IS E2E success, we know it's green (filtered by
        # the job-level `if` already). When upstream is publish, look up
        # E2E state for the same SHA. Four buckets:
        #
        #   - completed/success: E2E confirmed safe → proceed
        #   - completed/failure|cancelled|timed_out: E2E found a
        #     regression → ABORT (exit 1), `:latest` stays put
        #   - in_progress|queued|requested: E2E is RACING with publish
        #     for a runtime-touching SHA. publish typically completes
        #     ~5-10min before E2E (~10-15min). If we promote on the
        #     publish signal here, a later E2E failure can't roll back
        #     `:latest` — it'd already be wrongly advanced. So we DEFER:
        #     skip subsequent steps (proceed=false) and let E2E's own
        #     completion event re-fire this workflow, which then takes
        #     the upstream-is-E2E path. exit 0 so the run shows as
        #     success rather than a noisy fake-failure.
        #   - none/none: E2E was paths-filtered out for this SHA (the
        #     change touched canvas/cmd/sweep/etc. — paths covered by
        #     publish but not by E2E). pre-merge gates on staging
        #     already validated this SHA → proceed.
        #
        # Any other state (e.g. completed/skipped) lands in the catch-all
        # branch and aborts with exit 1.
        #
        # Manual dispatch skips this check — operator override.
        id: gate
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SHA: ${{ steps.sha.outputs.full }}
          UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
          EVENT_NAME: ${{ github.event_name }}
        run: |
          set -euo pipefail

          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Manual dispatch — skipping E2E gate (operator override)"
            exit 0
          fi

          if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied"
            exit 0
          fi

          # Upstream is publish-workspace-server-image. Check E2E state.
          # Two "empty" cases have to be handled:
          #   1. gh exits non-zero (network blip, auth issue) → the
          #      `if !` branch below substitutes "none/none" and emits a
          #      warning so the failure is visible in the run log.
          #   2. gh exits zero but returns `[]` (no E2E run on this
          #      main SHA — the common case for canvas-only / cmd-only
          #      / sweep-only changes whose paths don't trigger E2E).
          #      Without `(.[0] // {})`, jq sees `null` and emits
          #      "null/none" — which the case statement below has no
          #      branch for, so it falls into *) → exit 1.
          #      Surfaced 2026-04-30 the first time the App-token chain
          #      (#2389) actually fired auto-promote-on-e2e from a publish
          #      upstream — every prior run was E2E-upstream which
          #      short-circuits before this gate.
          #
          # NOTE(review): case 1 is fail-open — a failed lookup is treated
          # the same as "E2E did not run" and promotes. Confirm that is
          # still the intended trade-off for a kill-switch.
          if ! RESULT=$(gh run list \
            --repo "$REPO" \
            --workflow e2e-staging-saas.yml \
            --branch main \
            --commit "$SHA" \
            --limit 1 \
            --json status,conclusion \
            --jq '(.[0] // {}) | "\(.status // "none")/\(.conclusion // "none")"'); then
            echo "::warning::gh run list failed for ${SHA:0:7} — treating E2E state as none/none"
            RESULT="none/none"
          fi

          echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"

          case "$RESULT" in
            completed/success)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E green for this SHA — proceeding with promote"
              ;;
            completed/failure|completed/cancelled|completed/timed_out)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
                echo "\`:latest\` stays on the prior known-good digest."
                echo
                echo "If the failure was a flake, manually dispatch this workflow with the same sha to override."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            in_progress/*|queued/*|requested/*|waiting/*|pending/*)
              # Deferred, not failed: no `exit`, so the step ends green
              # with proceed=false and every later step is skipped.
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running"
                echo
                echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)."
                echo "Skipping retag here — E2E's own completion event will re-fire this workflow."
                echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts."
              } >> "$GITHUB_STEP_SUMMARY"
              ;;
            none/none)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry"
              ;;
            *)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❓ Auto-promote aborted — unexpected E2E state"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)"
                echo "Manual investigation needed; re-dispatch with the same sha once resolved."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
          esac

      - if: steps.gate.outputs.proceed == 'true'
        uses: imjasonh/setup-crane@31b88efe9de28ae0ffa220711af4b60be9435f6e # v0.4

      - name: GHCR login
        # Token and actor are handed over via `env:` so neither is
        # expanded into the script text.
        if: steps.gate.outputs.proceed == 'true'
        env:
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHCR_USER: ${{ github.actor }}
        run: |
          echo "$GHCR_TOKEN" | \
            crane auth login ghcr.io -u "$GHCR_USER" --password-stdin

      - name: Verify :staging- exists for both images
        # Better to fail fast with a clear message than to half-tag
        # (platform retagged but platform-tenant missing → tenants pull
        # a stale image).
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          set -euo pipefail
          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
            tag="${img}:staging-${SHORT_SHA}"
            if ! crane manifest "$tag" >/dev/null 2>&1; then
              echo "::error::Missing tag: $tag"
              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest."
              exit 1
            fi
            echo " ok: $tag exists"
          done

      - name: Ancestry check — refuse to promote :latest backwards
        # #2244: workflow_run completions arrive in arbitrary order. If
        # SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E
        # completes before SHA-A's, this workflow can fire for SHA-A
        # AFTER it already promoted SHA-B → :latest goes backwards. The
        # orphan-reconciler "next run corrects it" doesn't apply: there's
        # no auto-corrective re-promote, :latest stays wrong until the
        # next main push lands.
        #
        # Detection: read current :latest's `org.opencontainers.image.revision`
        # label (set by publish-workspace-server-image.yml at build time)
        # and ask the GitHub compare API whether the candidate SHA is
        # ahead-of / identical-to / behind / diverged-from current.
        # Hard-fail on `behind` and `diverged` per the approved design —
        # silent-bypass is the class we're moving away from. Workflow
        # goes red, oncall sees it, operator decides how to recover
        # (manual dispatch with the right SHA, force-promote, etc.).
        #
        # Manual dispatch skips this check — operator override semantics
        # match the gate-check step above.
        #
        # Backward-compat: when current :latest carries no revision
        # label (legacy image pre-publish-with-label), skip-with-warning.
        # All :latest images on main are post-label as of 2026-04-29, so
        # this branch will be dead within 90 days; remove then.
        #
        # Only ${IMAGE_NAME}:latest is inspected; the tenant image is not
        # compared separately.
        if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch'
        id: ancestry
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          TARGET_SHA: ${{ steps.sha.outputs.full }}
        run: |
          set -euo pipefail

          # Read the current :latest config and pull the revision label.
          # `crane config` returns the OCI image config blob (not the manifest);
          # labels live under `.config.Labels`. `// empty` makes jq return ""
          # rather than the literal "null" so the test below works.
          #
          # NOTE(review): `2>/dev/null` plus `|| true` means a failing
          # `crane config` (registry/auth error, or no :latest yet) also
          # yields an empty string and is reported below as a legacy
          # image without a label.
          CURRENT_REVISION=$(crane config "${IMAGE_NAME}:latest" 2>/dev/null \
            | jq -r '.config.Labels["org.opencontainers.image.revision"] // empty' \
            || true)

          if [ -z "$CURRENT_REVISION" ]; then
            echo "decision=skip-no-label" >> "$GITHUB_OUTPUT"
            {
              echo "## ⚠ Ancestry check skipped — current :latest has no revision label"
              echo
              echo "Likely a legacy image built before \`org.opencontainers.image.revision\` was set."
              echo "Falling through to retag. After all \`:latest\` images are post-label (TODO 90 days), this branch is dead and should be removed."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::warning::Current :latest carries no revision label — skipping ancestry check (legacy image)"
            exit 0
          fi

          if [ "$CURRENT_REVISION" = "$TARGET_SHA" ]; then
            echo "decision=identical" >> "$GITHUB_OUTPUT"
            echo "::notice:::latest already at ${TARGET_SHA:0:7} — retag will be a no-op"
            exit 0
          fi

          # Ask GitHub which side of the merge graph TARGET_SHA sits on
          # relative to CURRENT_REVISION. Returns one of: ahead | identical
          # | behind | diverged. Network or auth errors collapse to "error"
          # via the explicit fallback so the case below always matches.
          STATUS=$(gh api \
            "repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \
            --jq '.status' 2>/dev/null || echo "error")

          echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS"

          case "$STATUS" in
            ahead)
              echo "decision=ahead" >> "$GITHUB_OUTPUT"
              echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag"
              ;;
            identical)
              echo "decision=identical" >> "$GITHUB_OUTPUT"
              echo "::notice::Target identical to :latest — retag will be a no-op"
              ;;
            behind)
              echo "decision=behind" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote refused — target is BEHIND current :latest"
                echo
                echo "| Field | Value |"
                echo "|---|---|"
                echo "| Target SHA | \`$TARGET_SHA\` |"
                echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
                echo "| GitHub compare status | \`behind\` |"
                echo
                echo "This guard catches the workflow_run-completion-order race (#2244):"
                echo "two rapid main pushes whose E2Es complete out-of-order can otherwise"
                echo "promote \`:latest\` backwards. \`:latest\` stays on \`${CURRENT_REVISION:0:7}\`."
                echo
                echo "**Recovery:** if this is a legitimate revert that should land on \`:latest\`,"
                echo "manually dispatch this workflow with the target sha as input — the manual-dispatch"
                echo "path skips the ancestry check (operator override)."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            diverged)
              echo "decision=diverged" >> "$GITHUB_OUTPUT"
              {
                echo "## ❓ Auto-promote refused — history diverged"
                echo
                echo "| Field | Value |"
                echo "|---|---|"
                echo "| Target SHA | \`$TARGET_SHA\` |"
                echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
                echo "| GitHub compare status | \`diverged\` |"
                echo
                echo "Likely cause: force-push rewrote main's history, leaving the previous"
                echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            error|*)
              echo "decision=error" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote aborted — ancestry-check API error"
                echo
                echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`"
                echo
                echo "Manual dispatch with the target sha bypasses this check."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
          esac

      - name: Retag platform :staging- → :latest
        # Skipped automatically when the ancestry step failed: a step `if`
        # without a status function only runs while the job is still green.
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          crane tag "${IMAGE_NAME}:staging-${SHORT_SHA}" latest

      - name: Retag tenant :staging- → :latest
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          crane tag "${TENANT_IMAGE_NAME}:staging-${SHORT_SHA}" latest

      - name: Summary
        # Writes the promotion record to the run summary. Upstream name
        # and URL come in through `env:` so a workflow name containing
        # shell metacharacters cannot alter the script.
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
          EVENT_NAME: ${{ github.event_name }}
          UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
          UPSTREAM_URL: ${{ github.event.workflow_run.html_url }}
        run: |
          {
            echo "## :latest promoted to ${SHORT_SHA}"
            echo
            if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
              echo "- Trigger: manual dispatch"
            else
              echo "- Upstream: \`${UPSTREAM_NAME}\` ([run](${UPSTREAM_URL}))"
            fi
            echo "- platform:staging-${SHORT_SHA} → :latest"
            echo "- platform-tenant:staging-${SHORT_SHA} → :latest"
            echo
            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
          } >> "$GITHUB_STEP_SUMMARY"