name: canary-verify

# Runs the canary smoke suite against the staging canary tenant fleet
# after a new :staging-{sha} image lands in GHCR. On green, promotes
# :staging-{sha} → :latest so the prod tenant fleet's 5-minute
# auto-updater picks up the verified digest. On red, :latest stays
# on the prior known-good digest and prod is untouched.
#
# Dependencies:
#   - publish-workspace-server-image.yml publishes :staging-{sha}
#     (NOT :latest) on main merge
#   - canary tenants are configured to pull :staging-{sha} as their
#     tenant image (set TENANT_IMAGE=ghcr.io/…:staging-{sha} on the
#     canary provisioner code path OR rotate via an admin endpoint)
#   - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
#     CANARY_CP_SHARED_SECRET are populated

on:
  workflow_run:
    workflows: ["publish-workspace-server-image"]
    types: [completed]
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  actions: read

env:
  IMAGE_NAME: ghcr.io/molecule-ai/platform
  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant

jobs:
  canary-smoke:
    # Skip when the upstream workflow failed — no image to test against.
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
    runs-on: ubuntu-latest
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
      smoke_ran: ${{ steps.smoke.outputs.ran }}
    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

      - name: Compute sha
        id: compute
        # On a workflow_run trigger, GITHUB_SHA is the tip of the default
        # branch, which may already be ahead of the commit the upstream
        # publish run built. Use the triggering run's head_sha so the tag
        # we wait for, smoke-test and promote is the one that was actually
        # published. workflow_dispatch has no workflow_run payload, so fall
        # back to github.sha there.
        env:
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: echo "sha=${HEAD_SHA::7}" >> "$GITHUB_OUTPUT"

      - name: Wait for canary tenants to pick up :staging-
        # Poll canary health endpoints every 30s for up to 7 min instead
        # of a fixed 6-min sleep. Exits as soon as ALL canaries report
        # the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
        # proceeding after 7 min even if not all canaries responded —
        # the smoke suite will catch any that didn't update.
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
        run: |
          if [ -z "$CANARY_TENANT_URLS" ]; then
            echo "No canary URLs configured — falling back to 60s wait"
            sleep 60
            exit 0
          fi
          # CANARY_TENANT_URLS is a comma-separated list of base URLs.
          IFS=',' read -ra URLS <<< "$CANARY_TENANT_URLS"
          MAX_WAIT=420  # 7 minutes
          INTERVAL=30
          ELAPSED=0
          while [ $ELAPSED -lt $MAX_WAIT ]; do
            ALL_READY=true
            for url in "${URLS[@]}"; do
              # An unreachable canary yields "{}", which has no sha field
              # and therefore counts as "not ready".
              HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}")
              SHA=$(echo "$HEALTH" | grep -o "\"sha\":\"[^\"]*\"" | head -1 | cut -d'"' -f4)
              if [ "$SHA" != "$EXPECTED_SHA" ]; then
                ALL_READY=false
                break
              fi
            done
            if $ALL_READY; then
              echo "All canaries running staging-${EXPECTED_SHA} after ${ELAPSED}s"
              exit 0
            fi
            echo "Waiting for canaries... (${ELAPSED}s / ${MAX_WAIT}s)"
            sleep $INTERVAL
            ELAPSED=$((ELAPSED + INTERVAL))
          done
          echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)"

      - name: Run canary smoke suite
        id: smoke
        # Graceful-skip when no canary fleet is configured (Phase 2 not yet
        # stood up — see molecule-controlplane/docs/canary-tenants.md).
        # Sets `ran=false` on skip so promote-to-latest stays off (we don't
        # want every main merge auto-promoting without gating). Manual
        # promote-latest.yml is the release gate while canary is absent.
        # Once the fleet is real: delete the early-exit branch.
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }}
          CANARY_CP_BASE_URL: https://staging-api.moleculesai.app
          CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }}
        run: |
          set -euo pipefail
          if [ -z "${CANARY_TENANT_URLS:-}" ] \
             || [ -z "${CANARY_ADMIN_TOKENS:-}" ] \
             || [ -z "${CANARY_CP_SHARED_SECRET:-}" ]; then
            {
              echo "## ⚠️ canary-verify skipped"
              echo
              echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)."
              echo "Phase 2 canary fleet has not been stood up yet —"
              echo "see [canary-tenants.md](https://github.com/Molecule-AI/molecule-controlplane/blob/main/docs/canary-tenants.md)."
              echo
              echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "ran=false" >> "$GITHUB_OUTPUT"
            echo "::notice::canary-verify: skipped — no canary fleet configured"
            exit 0
          fi
          bash scripts/canary-smoke.sh
          # Only reached when the smoke script exits 0 (set -e above).
          echo "ran=true" >> "$GITHUB_OUTPUT"

      - name: Summary on failure
        if: ${{ failure() }}
        run: |
          {
            echo "## Canary smoke FAILED"
            echo
            echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`."
            echo ":latest stays pinned to the prior good digest — prod is untouched."
            echo
            echo "Fix forward and merge again, or investigate the specific failed"
            echo "assertions in the canary-smoke step log above."
          } >> "$GITHUB_STEP_SUMMARY"

  promote-to-latest:
    # On green, retag :staging-{sha} → :latest for BOTH images.
    # crane is a lightweight registry client (no Docker daemon needed on
    # the runner) that can retag remotely with a single API call each.
    # Gated on smoke_ran=true — without a real canary fleet the smoke
    # step no-ops with success, and we don't want that to silently
    # auto-promote every main merge.
    needs: canary-smoke
    if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    steps:
      - uses: imjasonh/setup-crane@31b88efe9de28ae0ffa220711af4b60be9435f6e # v0.4

      - name: GHCR login
        # Pass the token and actor through the environment rather than
        # interpolating them into the script text.
        env:
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHCR_USER: ${{ github.actor }}
        run: |
          echo "$GHCR_TOKEN" | \
            crane auth login ghcr.io -u "$GHCR_USER" --password-stdin

      - name: Retag platform :staging- → :latest
        run: |
          crane tag \
            "${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
            latest

      - name: Retag tenant :staging- → :latest
        run: |
          crane tag \
            "${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
            latest

      - name: Summary
        run: |
          {
            echo "## Canary verified — :latest promoted"
            echo
            echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`"
            echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${TENANT_IMAGE_NAME}:latest\`"
            echo
            echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle."
          } >> "$GITHUB_STEP_SUMMARY"