# molecule-core/.github/workflows/auto-promote-on-e2e.yml

# Display name of this workflow in the Actions UI.
name: 'Auto-promote :latest after main image build'

# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
# → `:latest` after either the image build or E2E completes on a `main`
# push, gated on E2E Staging SaaS not being red for that SHA.
#
# Why two triggers:
#
#   `publish-workspace-server-image` and `e2e-staging-saas` are both
#   paths-filtered, but with DIFFERENT path sets:
#
#     publish-workspace-server-image:
#       workspace-server/**, canvas/**, manifest.json
#
#     e2e-staging-saas (full lifecycle):
#       workspace-server/internal/handlers/{registry,workspace_provision,
#       a2a_proxy}.go, workspace-server/internal/middleware/**,
#       workspace-server/internal/provisioner/**, tests/e2e/test_staging_full_saas.sh
#
#   The E2E set is (apart from `tests/e2e/test_staging_full_saas.sh`,
#   which fires E2E without a publish) a SUBSET of the publish set. So:
#     - canvas/** changes → publish fires, E2E does not
#     - workspace-server/cmd/** changes → publish fires, E2E does not
#     - workspace-server/internal/sweep/** → publish fires, E2E does not
#
#   The previous version triggered ONLY on E2E completion, which meant
#   non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image
#   but never advanced `:latest`. Result: as of 2026-04-28 this workflow
#   had run zero times since merge despite eight main pushes — `:latest`
#   was ~7 hours / 9 PRs behind main with no human realising. See
#   `molecule-core` Slack discussion 2026-04-28.
#
#   Adding `publish-workspace-server-image` as a second trigger closes
#   the gap: any image rebuild on main is now eligible to advance
#   `:latest`.
#
# Why E2E remains a kill-switch (not the trigger):
#
#   When E2E DID run for this SHA and ended red, we abort — `:latest`
#   stays on the prior known-good digest. When E2E didn't run (paths
#   filtered out), we proceed: pre-merge gates already validated this
#   SHA on staging via auto-promote-staging requiring CI + E2E Canvas +
#   E2E API + CodeQL all green. Image content for non-E2E-paths
#   (canvas, cmd, sweep) is exercised by those staging gates.
#
# Why `main` only:
#
#   `:latest` is what prod tenants pull. We only want SHAs that have
#   reached main (via auto-promote-staging) to advance `:latest`.
#   Triggering on staging would let a staging-only revert advance
#   `:latest` to a SHA that never reaches main, breaking the "production
#   runs what's on main" invariant.
#
# Idempotency:
#
#   When a SHA touches paths that match BOTH publish and E2E, both
#   workflows fire and complete. Both trigger this workflow on
#   completion → two runs race. Both retag `:staging-<sha>` →
#   `:latest`. crane tag is idempotent (re-tagging the same digest is a
#   no-op), so the second run is harmless. concurrency group serializes
#   them anyway.

on:
  # Fires when either upstream workflow finishes a run on `main`. The
  # job-level `if` decides whether the conclusion allows a promote.
  workflow_run:
    workflows:
      - 'E2E Staging SaaS (full lifecycle)'
      - 'publish-workspace-server-image'
    types:
      - completed
    branches:
      - main
  # Manual override: an operator names the commit to promote.
  workflow_dispatch:
    inputs:
      sha:
        # A manual dispatch carries no `workflow_run` payload, so there is
        # nothing to default to: leaving this empty yields an empty
        # `:staging-` tag suffix and the run fails.
        description: 'Commit sha to promote (short or full). Needed for manual dispatch: there is no upstream workflow_run to default to.'
        required: false
        type: string

permissions:
  # `actions: read` lets the gate step's `gh run list` look up E2E runs.
  # Once a `permissions:` block is present, every scope not listed here
  # is set to none for GITHUB_TOKEN.
  actions: read
  contents: read
  # Needed to push the `:latest` tag to GHCR.
  packages: write

concurrency:
  # One queue per SHA: when publish and E2E both complete for the same
  # commit, their two promote runs execute one after the other instead
  # of racing. Runs for different SHAs do not block each other. The SHA
  # comes from the upstream run, else the dispatch input, else the ref
  # this workflow was started on.
  cancel-in-progress: false
  group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }}

env:
  # The two GHCR repositories whose `:staging-<sha>` tag is retagged to
  # `:latest` by the steps below.
  TENANT_IMAGE_NAME: 'ghcr.io/molecule-ai/platform-tenant'
  IMAGE_NAME: 'ghcr.io/molecule-ai/platform'

jobs:
  promote:
    runs-on: ubuntu-latest
    # Run only for a manual dispatch or a successful upstream run, so a
    # failed upstream never reaches the steps. The separate "E2E was red
    # for this SHA" kill-switch is implemented by the gate step: it
    # covers an upstream publish that succeeded while E2E for the same
    # SHA failed.
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' &&
      github.event.workflow_run.conclusion == 'success')
    steps:
      - name: Compute short sha
        # Resolves the commit to promote and publishes two step outputs:
        #   full  — the sha as supplied (dispatch input if set, otherwise
        #           the upstream workflow_run head_sha)
        #   short — its first 7 characters, the suffix of the
        #           `:staging-<sha>` tag that later steps retag
        #
        # Both candidates reach the script through `env:` instead of
        # `${{ }}` interpolation, so a crafted dispatch input is handled
        # as data and cannot inject shell. Anything that is not a 7-64
        # character hex string (including an empty value on a dispatch
        # without `sha`) fails here with a clear message rather than later
        # as a confusing "Missing tag: …:staging-" error.
        id: sha
        env:
          DISPATCH_SHA: ${{ github.event.inputs.sha }}
          UPSTREAM_SHA: ${{ github.event.workflow_run.head_sha }}
        run: |
          set -euo pipefail
          # Operator-supplied sha wins; otherwise use the upstream run's.
          FULL="${DISPATCH_SHA:-$UPSTREAM_SHA}"
          if [[ ! "$FULL" =~ ^[0-9a-fA-F]{7,64}$ ]]; then
            echo "::error::No valid commit sha to promote. Manual dispatch must pass the 'sha' input as 7-64 hex characters."
            exit 1
          fi
          echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
          echo "full=${FULL}" >> "$GITHUB_OUTPUT"

      - name: Gate — E2E Staging SaaS state for this SHA
        # Decides whether the retag steps may run, via the `proceed`
        # output ('true' / 'false').
        #
        # When upstream IS E2E success, we know it's green (filtered by
        # the job-level `if` already). When upstream is publish, look up
        # E2E state for the same SHA. Buckets:
        #
        #   - completed/success: E2E confirmed safe → proceed
        #   - completed/failure|cancelled|timed_out: E2E found a
        #     regression → ABORT (exit 1), `:latest` stays put
        #   - in_progress|queued|requested|waiting|pending: E2E is RACING
        #     with publish for a runtime-touching SHA. If we promoted on
        #     the publish signal here, a later E2E failure couldn't roll
        #     back `:latest`. So we DEFER: proceed=false, exit 0 (no
        #     noisy fake-failure), and E2E's own completion event
        #     re-fires this workflow via the upstream-is-E2E path.
        #   - none/none: no E2E run exists for this SHA (paths-filtered
        #     out — the change touched canvas/cmd/sweep/etc.). pre-merge
        #     gates on staging already validated this SHA → proceed.
        #   - anything else: unexpected state → ABORT (exit 1)
        #
        # The lookup fails CLOSED: if `gh run list` itself errors (API
        # outage, token without permission to read Actions runs), the
        # step aborts instead of treating the error as "no E2E run" and
        # promoting blind.
        #
        # Manual dispatch skips this check — operator override.
        id: gate
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SHA: ${{ steps.sha.outputs.full }}
          UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
          EVENT_NAME: ${{ github.event_name }}
        run: |
          set -euo pipefail

          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Manual dispatch — skipping E2E gate (operator override)"
            exit 0
          fi

          if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied"
            exit 0
          fi

          # Upstream is publish-workspace-server-image. Check E2E state.
          #
          # An empty run list is mapped to "none/none" explicitly: `.[0]`
          # of an empty array is null and would otherwise render as
          # "null/none", which matches no "proceed" bucket below.
          # stderr is left visible so a failed query can be diagnosed.
          RESULT=$(gh run list \
            --repo "$REPO" \
            --workflow e2e-staging-saas.yml \
            --branch main \
            --commit "$SHA" \
            --limit 1 \
            --json status,conclusion \
            --jq 'if length == 0 then "none/none" else (.[0] | "\(.status)/\(.conclusion // "none")") end') || {
            echo "proceed=false" >> "$GITHUB_OUTPUT"
            {
              echo "## ❓ Auto-promote aborted — could not query E2E state"
              echo
              echo "\`gh run list\` failed for \`${SHA:0:7}\`; refusing to promote without a gate result."
              echo "Check that the workflow token can read Actions runs, then re-run."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::error::Could not query E2E Staging SaaS runs for ${SHA:0:7}"
            exit 1
          }

          echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"

          case "$RESULT" in
            completed/success)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E green for this SHA — proceeding with promote"
              ;;
            completed/failure|completed/cancelled|completed/timed_out)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
                echo "\`:latest\` stays on the prior known-good digest."
                echo
                echo "If the failure was a flake, manually dispatch this workflow with the same sha to override."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            in_progress/*|queued/*|requested/*|waiting/*|pending/*)
              # Deferred, not failed: fall through with exit status 0.
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running"
                echo
                echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)."
                echo "Skipping retag here — E2E's own completion event will re-fire this workflow."
                echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts."
              } >> "$GITHUB_STEP_SUMMARY"
              ;;
            none/none)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry"
              ;;
            *)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❓ Auto-promote aborted — unexpected E2E state"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)"
                echo "Manual investigation needed; re-dispatch with the same sha once resolved."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
          esac

      # Provide the `crane` CLI used by the login/verify/retag steps. The
      # action is pinned to a commit sha and only runs when the gate
      # allowed the promote.
      - uses: imjasonh/setup-crane@31b88efe9de28ae0ffa220711af4b60be9435f6e # v0.4
        if: steps.gate.outputs.proceed == 'true'

      - name: GHCR login
        # Logs crane in to ghcr.io with the workflow token. The token and
        # actor are handed over through `env:` so they are never expanded
        # into the script text itself; the password goes in via stdin.
        if: steps.gate.outputs.proceed == 'true'
        env:
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHCR_USER: ${{ github.actor }}
        run: |
          set -euo pipefail
          echo "$GHCR_TOKEN" | \
            crane auth login ghcr.io -u "$GHCR_USER" --password-stdin

      - name: Verify :staging-<sha> exists for both images
        # Better to fail fast with a clear message than to half-tag
        # (platform retagged but platform-tenant missing → tenants pull
        # a stale image).
        #
        # The short sha arrives via `env:` (not `${{ }}` interpolation) so
        # a dispatch-supplied value cannot inject shell. crane's stderr is
        # captured and printed on failure so an auth or network error is
        # distinguishable from a genuinely missing tag.
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          set -euo pipefail
          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
            tag="${img}:staging-${SHORT_SHA}"
            # `2>&1 >/dev/null` keeps stderr in $err, drops the manifest.
            if ! err=$(crane manifest "$tag" 2>&1 >/dev/null); then
              echo "::error::Missing tag: $tag"
              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest."
              echo "crane output: ${err}"
              exit 1
            fi
            echo "  ok: $tag exists"
          done

      - name: Retag platform :staging-<sha> → :latest
        # Points `:latest` of the platform image at the manifest currently
        # tagged `:staging-<short sha>`. The sha is passed through `env:`
        # rather than interpolated into the script, keeping a
        # dispatch-supplied value out of the shell source.
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          set -euo pipefail
          crane tag "${IMAGE_NAME}:staging-${SHORT_SHA}" latest

      - name: Retag tenant :staging-<sha> → :latest
        # Points `:latest` of the platform-tenant image at the manifest
        # currently tagged `:staging-<short sha>`. The sha is passed
        # through `env:` rather than interpolated into the script, keeping
        # a dispatch-supplied value out of the shell source.
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          set -euo pipefail
          crane tag "${TENANT_IMAGE_NAME}:staging-${SHORT_SHA}" latest

      - name: Summary
        # Appends a promote report to the run's step summary: which sha
        # was promoted, what triggered it, and both retag operations.
        # All context values are read from `env:` so none of them is
        # expanded into the script text.
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
          EVENT_NAME: ${{ github.event_name }}
          UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
          UPSTREAM_URL: ${{ github.event.workflow_run.html_url }}
        run: |
          set -euo pipefail
          {
            echo "## :latest promoted to ${SHORT_SHA}"
            echo
            if [ "${EVENT_NAME}" = "workflow_dispatch" ]; then
              echo "- Trigger: manual dispatch"
            else
              echo "- Upstream: \`${UPSTREAM_NAME}\` ([run](${UPSTREAM_URL}))"
            fi
            echo "- platform:staging-${SHORT_SHA} → :latest"
            echo "- platform-tenant:staging-${SHORT_SHA} → :latest"
            echo
            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
          } >> "$GITHUB_STEP_SUMMARY"