name: canary-verify

# Runs the canary smoke suite against the staging canary tenant fleet
# after a new :staging-<sha> image lands in ECR. On green, calls the
# CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so
# the prod tenant fleet's 5-minute auto-updater picks up the verified
# digest. On red, :latest stays on the prior known-good digest and
# prod is untouched.
#
# Registry note (2026-05-10): This workflow previously used GHCR
# (ghcr.io/molecule-ai/platform-tenant) — that registry was retired
# during the 2026-05-06 Gitea suspension migration when publish-
# workspace-server-image.yml switched to the operator's ECR org
# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/
# platform-tenant). The GHCR → ECR migration was never applied to
# this file, so canary-verify was silently smoke-testing the stale
# GHCR image while the actual staging/prod tenants ran the ECR image.
# Result: smoke tests could not catch a broken ECR build. Fix:
#   - Wait step: reads SHA from running canary /health (tenant-
#     agnostic, works regardless of registry).
#   - Promote step: calls CP redeploy-fleet endpoint with
#     target_tag=staging-<sha>, same mechanism as
#     redeploy-tenants-on-main.yml. No longer attempts GHCR crane ops.
#
# Dependencies:
#   - publish-workspace-server-image.yml publishes :staging-<sha>
#     to ECR on staging and main merges.
#   - Canary tenants are configured to pull :staging-<sha> from ECR
#     (TENANT_IMAGE env set to the ECR :staging-<sha> tag).
#   - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
#     CANARY_CP_SHARED_SECRET are populated.

on:
  workflow_run:
    workflows: ["publish-workspace-server-image"]
    types: [completed]
  workflow_dispatch:

# Least privilege: the former `packages: write` grant only served the
# retired GHCR `crane tag` promote flow. Nothing in this workflow touches
# GitHub Packages any more (ECR access is not governed by GITHUB_TOKEN).
permissions:
  contents: read
  actions: read

env:
  # ECR registry (post-2026-05-06 SSOT for tenant images).
  # publish-workspace-server-image.yml pushes here.
  IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
  TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
  # CP endpoint for redeploy-fleet (used in promote step below).
  CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }}

jobs:
  canary-smoke:
    # Skip when the upstream workflow failed — no image to test against.
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
    runs-on: ubuntu-latest
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
      smoke_ran: ${{ steps.smoke.outputs.ran }}
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Compute sha
        id: compute
        # On `workflow_run` events GITHUB_SHA is the last commit on the
        # default branch, NOT the commit the upstream publish workflow
        # built. Use the triggering run's head_sha so the tag we wait for,
        # report, and promote is the one that was actually published.
        # workflow_dispatch has no workflow_run payload, so fall back to
        # github.sha there.
        env:
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: echo "sha=${HEAD_SHA::7}" >> "$GITHUB_OUTPUT"

      - name: Wait for canary tenants to pick up :staging-
        # Poll canary health endpoints every 30s for up to 7 min instead
        # of a fixed 6-min sleep. Exits as soon as ALL canaries report
        # the new SHA (~2-3 min typical vs 6 min fixed). Falls back to
        # proceeding after 7 min even if not all canaries responded —
        # the smoke suite will catch any that didn't update.
        #
        # NOTE: The SHA is read from the running tenant's /health response,
        # NOT from a registry lookup. This is registry-agnostic and works
        # regardless of whether the tenant pulls from ECR, GHCR, or any
        # other registry — the canary is telling us what it's actually
        # running, which is the ground truth for smoke testing.
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
        run: |
          if [ -z "$CANARY_TENANT_URLS" ]; then
            echo "No canary URLs configured — falling back to 60s wait"
            sleep 60
            exit 0
          fi
          # CANARY_TENANT_URLS is a comma-separated list of base URLs.
          IFS=',' read -ra URLS <<< "$CANARY_TENANT_URLS"
          MAX_WAIT=420  # 7 minutes
          INTERVAL=30
          ELAPSED=0
          while [ $ELAPSED -lt $MAX_WAIT ]; do
            ALL_READY=true
            for url in "${URLS[@]}"; do
              # An unreachable canary yields "{}", i.e. an empty SHA,
              # which simply counts as "not ready yet".
              HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}")
              SHA=$(echo "$HEALTH" | grep -o "\"sha\":\"[^\"]*\"" | head -1 | cut -d'"' -f4)
              if [ "$SHA" != "$EXPECTED_SHA" ]; then
                ALL_READY=false
                break
              fi
            done
            if $ALL_READY; then
              echo "All canaries running staging-${EXPECTED_SHA} after ${ELAPSED}s"
              exit 0
            fi
            echo "Waiting for canaries... (${ELAPSED}s / ${MAX_WAIT}s)"
            sleep $INTERVAL
            ELAPSED=$((ELAPSED + INTERVAL))
          done
          echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)"

      - name: Run canary smoke suite
        id: smoke
        # Graceful-skip when no canary fleet is configured (Phase 2 not yet
        # stood up — see molecule-controlplane/docs/canary-tenants.md).
        # Sets `ran=false` on skip so promote-to-latest stays off (we don't
        # want every main merge auto-promoting without gating). Manual
        # promote-latest.yml is the release gate while canary is absent.
        # Once the fleet is real: delete the early-exit branch.
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }}
          CANARY_CP_BASE_URL: https://staging-api.moleculesai.app
          CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }}
        run: |
          set -euo pipefail
          if [ -z "${CANARY_TENANT_URLS:-}" ] \
            || [ -z "${CANARY_ADMIN_TOKENS:-}" ] \
            || [ -z "${CANARY_CP_SHARED_SECRET:-}" ]; then
            {
              echo "## ⚠️ canary-verify skipped"
              echo
              echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)."
              echo "Phase 2 canary fleet has not been stood up yet —"
              echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
              echo
              echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "ran=false" >> "$GITHUB_OUTPUT"
            echo "::notice::canary-verify: skipped — no canary fleet configured"
            exit 0
          fi
          bash scripts/canary-smoke.sh
          echo "ran=true" >> "$GITHUB_OUTPUT"

      - name: Summary on failure
        if: ${{ failure() }}
        # SHA is passed through env rather than interpolated into the
        # script text, so the shell never parses expression output as code.
        env:
          SHA: ${{ steps.compute.outputs.sha }}
        run: |
          {
            echo "## Canary smoke FAILED"
            echo
            echo "Canary tenants rejected image \`staging-${SHA}\`."
            echo ":latest stays pinned to the prior good digest — prod is untouched."
            echo
            echo "Fix forward and merge again, or investigate the specific failed"
            echo "assertions in the canary-smoke step log above."
          } >> "$GITHUB_STEP_SUMMARY"

  promote-to-latest:
    # On green, calls the CP redeploy-fleet endpoint with
    # target_tag=staging-<sha> to promote the verified ECR image. This is
    # the same mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops.
    #
    # Pre-fix history: the old GHCR promote step used `crane tag` against
    # ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server-
    # image.yml had already migrated to ECR on 2026-05-07 (commit
    # 10e510f5). The GHCR tags were never updated, so this step was
    # silently promoting a stale GHCR image while actual prod tenants
    # pulled from ECR. Canary smoke tests were GHCR-targeted and could
    # not catch a broken ECR build.
    needs: canary-smoke
    if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    env:
      # CP_URL and TENANT_IMAGE_NAME are inherited from the workflow-level env.
      SHA: ${{ needs.canary-smoke.outputs.sha }}
      # CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint.
      # Stored at the repo level so all workflows pick it up automatically.
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      # canary_slug pin: deploy the verified :staging-<sha> to the canary
      # first (soak 120s), then fan out to the rest of the fleet.
      CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }}
      SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }}
      BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }}
    steps:
      - name: Check CP credentials
        run: |
          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
            echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet."
            echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret."
            exit 1
          fi

      - name: Promote verified ECR image to :latest
        run: |
          set -euo pipefail
          TARGET_TAG="staging-${SHA}"
          # --argjson keeps soak/batch/dry_run as JSON numbers/booleans.
          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --argjson soak "${SOAK_SECONDS:-120}" \
            --argjson batch "${BATCH_SIZE:-3}" \
            --argjson dry false \
            '{
              target_tag: $tag,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')
          # canary_slug is optional; only add it when the repo var is set.
          if [ -n "${CANARY_SLUG:-}" ]; then
            BODY=$(jq -c --arg slug "$CANARY_SLUG" '. * {canary_slug: $slug}' <<<"$BODY")
          fi
          echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo "  target_tag: $TARGET_TAG"
          echo "  body: $BODY"
          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE_FILE=$(mktemp)
          # Disable errexit around curl so we can capture its exit code and
          # still print the response body before deciding pass/fail.
          set +e
          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" >"$HTTP_CODE_FILE"
          CURL_EXIT=$?
          set -e
          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
          HTTP_CODE="${HTTP_CODE:-000}"
          echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)"
          # Pretty-print JSON when possible; fall back to the raw body.
          jq . "$HTTP_RESPONSE" || cat "$HTTP_RESPONSE"
          # A transport failure (DNS, connect, -m timeout) leaves the status
          # at 000, which a bare ">= 400" check would wave through as
          # success. Fail explicitly on any curl error.
          if [ "$CURL_EXIT" -ne 0 ]; then
            echo "::error::curl exited $CURL_EXIT calling CP redeploy-fleet (HTTP $HTTP_CODE) — promotion not confirmed."
            exit 1
          fi
          # Only a 2xx confirms the CP accepted the redeploy; curl does not
          # follow redirects here, so a 3xx means the POST was not processed.
          if [ "$HTTP_CODE" -lt 200 ] || [ "$HTTP_CODE" -ge 300 ]; then
            echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed."
            exit 1
          fi

      - name: Summary
        run: |
          {
            echo "## Canary verified — :latest promoted via CP redeploy-fleet"
            echo ""
            echo "- **Target tag:** \`staging-${SHA}\`"
            echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)"
            echo "- **Canary slug:** \`${CANARY_SLUG:-}\` (soak ${SOAK_SECONDS}s)"
            echo "- **Batch size:** ${BATCH_SIZE:-3}"
            echo ""
            echo "CP redeploy-fleet is rolling out the verified image across the prod fleet."
            echo "The fleet's 5-minute health-check loop will pick up the update automatically."
          } >> "$GITHUB_STEP_SUMMARY"