name: Auto-promote :latest after main image build

# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
# → `:latest` after either the image build or E2E completes on a `main`
# push, gated on E2E Staging SaaS not being red for that SHA.
#
# Why two triggers:
#
#   `publish-workspace-server-image` and `e2e-staging-saas` are both
#   paths-filtered, but with DIFFERENT path sets:
#
#     publish-workspace-server-image:
#       workspace-server/**, canvas/**, manifest.json
#
#     e2e-staging-saas (full lifecycle):
#       workspace-server/internal/handlers/{registry,workspace_provision,
#       a2a_proxy}.go, workspace-server/internal/middleware/**,
#       workspace-server/internal/provisioner/**,
#       tests/e2e/test_staging_full_saas.sh
#
#   The E2E set is a strict SUBSET of the publish set. So:
#     - canvas/** changes → publish fires, E2E does not
#     - workspace-server/cmd/** changes → publish fires, E2E does not
#     - workspace-server/internal/sweep/** → publish fires, E2E does not
#
#   NOTE(review): as listed above, tests/e2e/test_staging_full_saas.sh is
#   NOT inside the publish path set, so "strict subset" looks slightly
#   off. A change touching only that script would presumably fire E2E
#   without a publish, and the "Verify" step below would then fail on the
#   missing `:staging-<sha>` tag. Confirm against the two workflows' real
#   `paths:` filters.
#
#   The previous version triggered ONLY on E2E completion, which meant
#   non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image
#   but never advanced `:latest`. Result: as of 2026-04-28 this workflow
#   had run zero times since merge despite eight main pushes — `:latest`
#   was ~7 hours / 9 PRs behind main with no human realising. See
#   `molecule-core` Slack discussion 2026-04-28.
#
#   Adding `publish-workspace-server-image` as a second trigger closes
#   the gap: any image rebuild on main eligibly advances `:latest`.
#
# Why E2E remains a kill-switch (not the trigger):
#
#   When E2E DID run for this SHA and ended red, we abort — `:latest`
#   stays on the prior known-good digest. When E2E didn't run (paths
#   filtered out), we proceed: pre-merge gates already validated this
#   SHA on staging via auto-promote-staging requiring CI + E2E Canvas +
#   E2E API + CodeQL all green. Image content for non-E2E-paths
#   (canvas, cmd, sweep) is exercised by those staging gates.
#
# Why `main` only:
#
#   `:latest` is what prod tenants pull. We only want SHAs that have
#   reached main (via auto-promote-staging) to advance `:latest`.
#   Triggering on staging would let a staging-only revert advance
#   `:latest` to a SHA that never reaches main, breaking the "production
#   runs what's on main" invariant.
#
# Idempotency:
#
#   When a SHA touches paths that match BOTH publish and E2E, both
#   workflows fire and complete. Both trigger this workflow on
#   completion → two runs race. Both retag `:staging-<sha>` →
#   `:latest`. crane tag is idempotent (re-tagging the same digest is a
#   no-op), so the second run is harmless. concurrency group serializes
#   them anyway.

on:
  workflow_run:
    workflows:
      - 'E2E Staging SaaS (full lifecycle)'
      - 'publish-workspace-server-image'
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      sha:
        description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)'
        required: false
        type: string

permissions:
  # Listing any permission sets every unlisted scope to `none`, so the
  # scopes this workflow actually uses must all be spelled out.
  # `actions: read` is what lets the gate step's `gh run list` see the
  # E2E run for this SHA.
  actions: read
  contents: read
  packages: write

concurrency:
  # Serialize promotes per-SHA so the publish+E2E both-fired race lands
  # cleanly. Different SHAs can promote in parallel.
  group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }}
  cancel-in-progress: false

env:
  IMAGE_NAME: ghcr.io/molecule-ai/platform
  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant

jobs:
  promote:
    # Proceed if upstream succeeded OR manual dispatch. Upstream-failure
    # paths are filtered here; the E2E-was-red kill-switch lives in the
    # gate-check step below (covers the case where upstream is publish
    # success but E2E for the same SHA failed).
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    steps:
      - name: Compute short sha
        id: sha
        # Expression values reach the script through env vars rather than
        # being spliced into the shell text, so a crafted dispatch input
        # cannot inject commands.
        env:
          INPUT_SHA: ${{ github.event.inputs.sha }}
          UPSTREAM_SHA: ${{ github.event.workflow_run.head_sha }}
        run: |
          set -euo pipefail

          # Manual override wins; otherwise use the upstream run's head sha.
          FULL="${INPUT_SHA:-$UPSTREAM_SHA}"

          # A manual dispatch without a `sha` input has no upstream run to
          # fall back on. Fail here with a clear message instead of letting
          # a later step look up the nonsensical tag `:staging-`.
          if ! [[ "$FULL" =~ ^[0-9a-fA-F]{7,40}$ ]]; then
            echo "::error::No usable commit sha (got '${FULL}'). Provide the 'sha' input (7-40 hex chars) when dispatching manually."
            exit 1
          fi

          echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
          echo "full=${FULL}" >> "$GITHUB_OUTPUT"

      - name: Gate — E2E Staging SaaS state for this SHA
        # When upstream IS E2E success, we know it's green (filtered by
        # the job-level `if` already). When upstream is publish, look up
        # E2E state for the same SHA. Buckets:
        #
        #   - completed/success: E2E confirmed safe → proceed
        #   - completed/failure|cancelled|timed_out: E2E found a
        #     regression → ABORT (exit 1), `:latest` stays put
        #   - in_progress|queued|requested: E2E is RACING with publish
        #     for a runtime-touching SHA. publish typically completes
        #     ~5-10min before E2E (~10-15min). If we promote on the
        #     publish signal here, a later E2E failure can't roll back
        #     `:latest` — it'd already be wrongly advanced. So we DEFER:
        #     skip subsequent steps (proceed=false) and let E2E's own
        #     completion event re-fire this workflow, which then takes
        #     the upstream-is-E2E path. exit 0 so the run shows as
        #     success rather than a noisy fake-failure.
        #   - none/none: E2E was paths-filtered out for this SHA (the
        #     change touched canvas/cmd/sweep/etc. — paths covered by
        #     publish but not by E2E). pre-merge gates on staging
        #     already validated this SHA → proceed.
        #   - lookup failed (gh/API error): E2E state is UNKNOWN →
        #     ABORT (exit 1). The kill-switch must fail closed; an
        #     operator can still override via manual dispatch.
        #
        # Manual dispatch skips this check — operator override.
        id: gate
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          SHA: ${{ steps.sha.outputs.full }}
          UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
          EVENT_NAME: ${{ github.event_name }}
        run: |
          set -euo pipefail

          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Manual dispatch — skipping E2E gate (operator override)"
            exit 0
          fi

          if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then
            echo "proceed=true" >> "$GITHUB_OUTPUT"
            echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied"
            exit 0
          fi

          # Upstream is publish-workspace-server-image. Check E2E state.
          #
          # Two distinct outcomes must not be conflated:
          #   * the query succeeds and returns zero runs → "none/none"
          #     (emitted explicitly by the jq filter; `.[0]` of an empty
          #     list is null and would otherwise render as "null/none")
          #   * the query itself fails → unknown state, abort below.
          if ! RESULT=$(gh run list \
            --repo "$REPO" \
            --workflow e2e-staging-saas.yml \
            --branch main \
            --commit "$SHA" \
            --limit 1 \
            --json status,conclusion \
            --jq 'if length == 0 then "none/none" else (.[0] | "\(.status)/\(.conclusion // "none")") end'); then
            echo "proceed=false" >> "$GITHUB_OUTPUT"
            echo "::error::Could not query E2E Staging SaaS runs for ${SHA:0:7} — refusing to promote on unknown E2E state."
            {
              echo "## ❓ Auto-promote aborted — E2E state lookup failed"
              echo
              echo "Could not list E2E Staging SaaS runs for \`${SHA:0:7}\`."
              echo "\`:latest\` stays on the prior known-good digest."
              echo
              echo "Re-run this job, or manually dispatch this workflow with the same sha to override."
            } >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi

          echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"

          case "$RESULT" in
            completed/success)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E green for this SHA — proceeding with promote"
              ;;
            completed/failure|completed/cancelled|completed/timed_out)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
                echo "\`:latest\` stays on the prior known-good digest."
                echo
                echo "If the failure was a flake, manually dispatch this workflow with the same sha to override."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
            in_progress/*|queued/*|requested/*|waiting/*|pending/*)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running"
                echo
                echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)."
                echo "Skipping retag here — E2E's own completion event will re-fire this workflow."
                echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts."
              } >> "$GITHUB_STEP_SUMMARY"
              ;;
            none/none)
              echo "proceed=true" >> "$GITHUB_OUTPUT"
              echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry"
              ;;
            *)
              echo "proceed=false" >> "$GITHUB_OUTPUT"
              {
                echo "## ❓ Auto-promote aborted — unexpected E2E state"
                echo
                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)"
                echo "Manual investigation needed; re-dispatch with the same sha once resolved."
              } >> "$GITHUB_STEP_SUMMARY"
              exit 1
              ;;
          esac

      - if: steps.gate.outputs.proceed == 'true'
        uses: imjasonh/setup-crane@31b88efe9de28ae0ffa220711af4b60be9435f6e  # v0.4

      - name: GHCR login
        if: steps.gate.outputs.proceed == 'true'
        # Token and actor go through env so neither is spliced into the
        # script text.
        env:
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHCR_USER: ${{ github.actor }}
        run: |
          set -euo pipefail
          echo "$GHCR_TOKEN" | \
            crane auth login ghcr.io -u "$GHCR_USER" --password-stdin

      - name: Verify :staging- exists for both images
        # Better to fail fast with a clear message than to half-tag
        # (platform retagged but platform-tenant missing → tenants pull
        # a stale image).
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          set -euo pipefail
          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
            tag="${img}:staging-${SHORT_SHA}"
            if ! crane manifest "$tag" >/dev/null 2>&1; then
              echo "::error::Missing tag: $tag"
              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest."
              exit 1
            fi
            echo "  ok: $tag exists"
          done

      - name: Retag platform :staging- → :latest
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          crane tag "${IMAGE_NAME}:staging-${SHORT_SHA}" latest

      - name: Retag tenant :staging- → :latest
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
        run: |
          crane tag "${TENANT_IMAGE_NAME}:staging-${SHORT_SHA}" latest

      - name: Summary
        if: steps.gate.outputs.proceed == 'true'
        env:
          SHORT_SHA: ${{ steps.sha.outputs.short }}
          EVENT_NAME: ${{ github.event_name }}
          UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
          UPSTREAM_URL: ${{ github.event.workflow_run.html_url }}
        run: |
          {
            echo "## :latest promoted to ${SHORT_SHA}"
            echo
            if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
              echo "- Trigger: manual dispatch"
            else
              echo "- Upstream: \`${UPSTREAM_NAME}\` ([run](${UPSTREAM_URL}))"
            fi
            echo "- platform:staging-${SHORT_SHA} → :latest"
            echo "- platform-tenant:staging-${SHORT_SHA} → :latest"
            echo
            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
          } >> "$GITHUB_STEP_SUMMARY"