Completes the canary release train. Before this, publish-workspace-server-image.yml pushed both :staging-<sha> and :latest on every main merge — meaning the prod tenant fleet auto-pulled every image immediately, before any post-deploy smoke test. A broken image (think: this morning's E2E current_task drift, but shipped at 3am instead of caught in CI) would have fanned out to every running tenant within 5 min. Now: - publish workflow pushes :staging-<sha> ONLY - canary tenants are configured to track :staging-<sha>; they pick up the new image on their next auto-update cycle - canary-verify.yml runs the smoke suite (Phase 2) after the sleep - on green: a new promote-to-latest job uses crane to remotely retag :staging-<sha> → :latest for both platform and tenant images - prod tenants auto-update to the newly-retagged :latest within their usual 5-min window - on red: :latest stays frozen on prior good digest; prod is untouched crane is pulled onto the runner (~4 MB, GitHub release) rather than docker-daemon retag so the workflow doesn't need a privileged runner. Rollback: if canary passed but something surfaces post-promotion, operator runs "crane tag ghcr.io/molecule-ai/platform:<prior-good-sha> latest" manually. A follow-up can wrap that in a Phase 4 admin endpoint / script. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
114 lines
4.2 KiB
YAML
114 lines
4.2 KiB
YAML
name: canary-verify

# Gate between a freshly published image and the production fleet.
#
# Once a new :staging-<sha> image is in GHCR, this workflow runs the canary
# smoke suite against the staging canary tenants.
#   - Green: :staging-<sha> is retagged to :latest, and the prod tenants'
#     5-minute auto-updater picks up the verified digest.
#   - Red:   :latest is left on the previous known-good digest, so prod is
#     not touched.
#
# Prerequisites:
#   - publish-workspace-server-image.yml publishes :staging-<sha> on main
#     merge and does NOT publish :latest.
#   - Canary tenants pull :staging-<sha> as their tenant image, either by
#     setting TENANT_IMAGE=ghcr.io/…:staging-<sha> on the canary provisioner
#     code path or by rotating it through an admin endpoint.
#   - The repo secrets CANARY_TENANT_URLS, CANARY_ADMIN_TOKENS and
#     CANARY_CP_SHARED_SECRET are populated.
on:
  # Fires whenever a run of the publish workflow finishes, whatever its
  # outcome; the canary-smoke job filters on the run's conclusion.
  workflow_run:
    workflows:
      - publish-workspace-server-image
    types:
      - completed
  # Manual trigger.
  workflow_dispatch:
# Workflow-wide GITHUB_TOKEN scopes.
permissions:
  # NOTE(review): no step in this workflow visibly needs this scope; confirm
  # it is still required before removing.
  actions: read
  # Used by actions/checkout to fetch the repository.
  contents: read
  # Needed so crane can move the :latest tags in GHCR.
  packages: write
# Image repositories whose :staging-<sha> tag is promoted to :latest.
env:
  # Platform image.
  IMAGE_NAME: 'ghcr.io/molecule-ai/platform'
  # Tenant image.
  TENANT_IMAGE_NAME: 'ghcr.io/molecule-ai/platform-tenant'
jobs:
  # Runs the smoke suite against the canary tenants and exposes the short sha
  # of the image under test as the `sha` job output.
  canary-smoke:
    # Skip when the upstream workflow failed — no image to test against.
    # Manual dispatches always run.
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
    runs-on: ubuntu-latest
    # Job-level permissions replace the workflow-level grant, so this job
    # (which only reads) does not hold a token that can push packages.
    permissions:
      actions: read
      contents: read
      packages: read
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Run the smoke suite from the same commit the image was built
          # from. Falls back to github.sha on workflow_dispatch, where there
          # is no workflow_run payload.
          ref: ${{ github.event.workflow_run.head_sha || github.sha }}

      - name: Compute sha
        id: compute
        env:
          # On a workflow_run trigger GITHUB_SHA is the latest commit on the
          # default branch, which is not necessarily the commit the publish
          # run built. Use the triggering run's head_sha so the tag matches
          # the image that was actually published.
          FULL_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: echo "sha=${FULL_SHA::7}" >> "$GITHUB_OUTPUT"

      - name: Wait for canary tenants to pick up :staging-<sha>
        # Tenant auto-updater runs every 5 min. Sleep 6 min to give every
        # canary time to pull + restart. Cheaper than polling.
        run: sleep 360

      - name: Run canary smoke suite
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }}
          CANARY_CP_BASE_URL: https://staging-api.moleculesai.app
          CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }}
        run: bash scripts/canary-smoke.sh

      - name: Summary on failure
        if: ${{ failure() }}
        env:
          # Passed through the environment rather than interpolated into the
          # script text.
          SHA: ${{ steps.compute.outputs.sha }}
        run: |
          {
            echo "## Canary smoke FAILED"
            echo
            echo "Canary tenants rejected image \`staging-${SHA}\`."
            echo ":latest stays pinned to the prior good digest — prod is untouched."
            echo
            echo "Fix forward and merge again, or investigate the specific failed"
            echo "assertions in the canary-smoke step log above."
          } >> "$GITHUB_STEP_SUMMARY"

  # On green, retag :staging-<sha> → :latest for BOTH images.
  # crane is a registry client that retags remotely, so no Docker daemon is
  # needed on the runner.
  promote-to-latest:
    needs: canary-smoke
    if: ${{ needs.canary-smoke.result == 'success' }}
    runs-on: ubuntu-latest
    # Only this job moves tags, so only this job gets packages: write.
    permissions:
      contents: read
      packages: write
    env:
      # Short sha of the image that passed the canary suite.
      SHA: ${{ needs.canary-smoke.outputs.sha }}
    steps:
      - name: Install crane
        run: |
          # pipefail: a failed download must fail the step instead of being
          # masked by tar's exit status.
          set -euo pipefail
          curl -fsSL https://github.com/google/go-containerregistry/releases/download/v0.20.2/go-containerregistry_Linux_x86_64.tar.gz | \
            tar xz -C /usr/local/bin crane

      - name: GHCR login
        env:
          # Token and actor are handed over as environment variables so they
          # are never spliced into the script source.
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHCR_USER: ${{ github.actor }}
        run: |
          echo "${GHCR_TOKEN}" | \
            crane auth login ghcr.io -u "${GHCR_USER}" --password-stdin

      - name: Verify both staging images exist
        # Resolve both source tags before moving anything, so a missing
        # tenant image cannot leave platform:latest promoted on its own.
        run: |
          : "${SHA:?canary-smoke did not produce a sha output}"
          crane digest "${IMAGE_NAME}:staging-${SHA}"
          crane digest "${TENANT_IMAGE_NAME}:staging-${SHA}"

      - name: Retag platform :staging-<sha> → :latest
        run: |
          crane tag \
            "${IMAGE_NAME}:staging-${SHA}" \
            latest

      - name: Retag tenant :staging-<sha> → :latest
        run: |
          crane tag \
            "${TENANT_IMAGE_NAME}:staging-${SHA}" \
            latest

      - name: Summary
        run: |
          {
            echo "## Canary verified — :latest promoted"
            echo
            echo "- \`${IMAGE_NAME}:staging-${SHA}\` → \`${IMAGE_NAME}:latest\`"
            echo "- \`${TENANT_IMAGE_NAME}:staging-${SHA}\` → \`${TENANT_IMAGE_NAME}:latest\`"
            echo
            echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle."
          } >> "$GITHUB_STEP_SUMMARY"