# forked from molecule-ai/molecule-core
#
# The canary-verify workflow blocked the self-hosted runner for a fixed
# 6 minutes regardless of whether canaries had already updated. This wastes
# the runner slot when canaries update in 2-3 minutes.
#
# Fix: poll each canary's /health endpoint every 30s for up to 7 min, and
# exit early when all canaries report the expected SHA. After the timeout
# the workflow proceeds anyway — the smoke suite validates regardless.
# Typical time saving: ~3-4 minutes per canary verify run.
#
# Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
#
# 160 lines · 6.1 KiB · YAML
name: canary-verify

# Smoke-tests each freshly published :staging-<sha> image on the staging
# canary tenant fleet before prod is allowed to see it.
#
#   green → :staging-<sha> is promoted to :latest, and the prod tenant
#           fleet's 5-minute auto-updater picks up the verified digest.
#   red   → :latest stays on the prior known-good digest; prod is untouched.
#
# Prerequisites:
#   - publish-workspace-server-image.yml publishes :staging-<sha>
#     (NOT :latest) on main merge.
#   - Canary tenants are configured to pull :staging-<sha> as their tenant
#     image (set TENANT_IMAGE=ghcr.io/…:staging-<sha> on the canary
#     provisioner code path OR rotate via an admin endpoint).
#   - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
#     CANARY_CP_SHARED_SECRET are populated.

# Fires whenever the image-publish workflow completes (whatever its
# conclusion — the canary-smoke job's `if:` filters on success), and can
# also be started manually.
on:
  workflow_run:
    workflows:
      - publish-workspace-server-image
    types:
      - completed
  workflow_dispatch:

# GITHUB_TOKEN scopes for every job in this workflow.
permissions:
  actions: read
  contents: read
  # promote-to-latest logs in to ghcr.io with GITHUB_TOKEN and retags images.
  packages: write

# Image repositories retagged by promote-to-latest (tags are appended in
# the steps that use them).
env:
  IMAGE_NAME: "ghcr.io/molecule-ai/platform"
  TENANT_IMAGE_NAME: "ghcr.io/molecule-ai/platform-tenant"

jobs:
  # Waits for the canary fleet to roll onto the new image, then runs the
  # smoke suite against it. Exposes the short sha of the image under test
  # as the `sha` output for the promotion job.
  canary-smoke:
    # Skip when the upstream workflow failed — no image to test against.
    # Manual dispatches are let through explicitly.
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
    # Self-hosted mac mini — GitHub-hosted minutes are quota-blocked on
    # this org (same reason publish/promote-latest moved earlier).
    runs-on: [self-hosted, macos, arm64]
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Compute sha
        id: compute
        # On workflow_run events GITHUB_SHA is the last commit on the
        # default branch, which can already be ahead of the commit whose
        # image was just published. Use the triggering run's head_sha so
        # the tag we wait for, test, and promote is the one that was
        # actually built; fall back to github.sha for manual dispatches,
        # which carry no workflow_run payload.
        env:
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: echo "sha=${HEAD_SHA::7}" >> "$GITHUB_OUTPUT"

      - name: Wait for canary tenants to pick up :staging-<sha>
        # Poll canary health endpoints every 30s for up to 7 min instead
        # of a fixed 6-min sleep. Exits as soon as ALL canaries report the
        # new SHA, freeing the self-hosted runner slot sooner (~2-3 min
        # typical vs 6 min fixed). Falls back to proceeding after 7 min
        # even if not all canaries responded — the smoke suite will catch
        # any that didn't update.
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
        run: |
          # Split the comma-separated secret, then drop whitespace, a
          # trailing slash, and empty entries (e.g. from a trailing comma)
          # so a cosmetic quirk in the secret cannot force a full timeout.
          IFS=',' read -ra URLS <<< "$CANARY_TENANT_URLS"
          TARGETS=()
          for raw in "${URLS[@]}"; do
            url="${raw//[[:space:]]/}"
            url="${url%/}"
            if [ -n "$url" ]; then
              TARGETS+=("$url")
            fi
          done
          if [ "${#TARGETS[@]}" -eq 0 ]; then
            echo "No canary URLs configured — falling back to 60s wait"
            sleep 60
            exit 0
          fi

          MAX_WAIT=420  # 7 minutes
          INTERVAL=30
          # SECONDS is bash's wall-clock counter; resetting it makes the
          # budget include time spent inside curl, not just the sleeps.
          SECONDS=0
          while [ "$SECONDS" -lt "$MAX_WAIT" ]; do
            ALL_READY=true
            for url in "${TARGETS[@]}"; do
              # Any curl failure is treated as "not updated yet".
              HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}")
              # Tolerate optional whitespace around the colon so both
              # compact and pretty-printed JSON are understood.
              SHA=$(echo "$HEALTH" | grep -oE '"sha"[[:space:]]*:[[:space:]]*"[^"]*"' | head -1 | cut -d'"' -f4)
              # Prefix match: a canary reporting a longer (e.g. full
              # 40-char) SHA still counts once it starts with the
              # expected short SHA. An empty SHA never matches.
              if [ -z "$SHA" ] || [ "${SHA:0:${#EXPECTED_SHA}}" != "$EXPECTED_SHA" ]; then
                ALL_READY=false
                break
              fi
            done
            if $ALL_READY; then
              echo "All canaries running staging-${EXPECTED_SHA} after ${SECONDS}s"
              exit 0
            fi
            echo "Waiting for canaries... (${SECONDS}s / ${MAX_WAIT}s)"
            sleep "$INTERVAL"
          done
          echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)"

      - name: Run canary smoke suite
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }}
          CANARY_CP_BASE_URL: https://staging-api.moleculesai.app
          CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }}
        run: bash scripts/canary-smoke.sh

      - name: Summary on failure
        if: ${{ failure() }}
        # The sha is handed over via env rather than interpolated into
        # the script text.
        env:
          STAGING_SHA: ${{ steps.compute.outputs.sha }}
        run: |
          {
            echo "## Canary smoke FAILED"
            echo
            echo "Canary tenants rejected image \`staging-${STAGING_SHA}\`."
            echo ":latest stays pinned to the prior good digest — prod is untouched."
            echo
            echo "Fix forward and merge again, or investigate the specific failed"
            echo "assertions in the canary-smoke step log above."
          } >> "$GITHUB_STEP_SUMMARY"

  promote-to-latest:
    # On green, retag :staging-<sha> → :latest for BOTH images.
    # crane is a lightweight registry client (no Docker daemon needed on
    # the runner) that can retag remotely with a single API call each.
    needs: canary-smoke
    if: ${{ needs.canary-smoke.result == 'success' }}
    runs-on: [self-hosted, macos, arm64]
    # Short sha verified by canary-smoke, shared by every step below via
    # the environment instead of inline ${{ }} interpolation.
    env:
      STAGING_SHA: ${{ needs.canary-smoke.outputs.sha }}
    steps:
      - name: Ensure crane installed
        # Matches the install pattern in promote-latest.yml — brew
        # cleanup exits non-zero on the shared runner's /opt/homebrew
        # symlinks, so skip it.
        env:
          HOMEBREW_NO_INSTALL_CLEANUP: "1"
          HOMEBREW_NO_AUTO_UPDATE: "1"
          HOMEBREW_NO_ENV_HINTS: "1"
        run: |
          if ! command -v crane >/dev/null 2>&1; then
            brew install crane
          fi
          crane version

      - name: GHCR login
        # Token and username are passed through env so neither is ever
        # pasted into the shell script text.
        env:
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHCR_USER: ${{ github.actor }}
        run: |
          echo "$GHCR_TOKEN" | \
            crane auth login ghcr.io -u "$GHCR_USER" --password-stdin

      - name: Retag platform :staging-<sha> → :latest
        run: |
          crane tag \
            "${IMAGE_NAME}:staging-${STAGING_SHA}" \
            latest

      - name: Retag tenant :staging-<sha> → :latest
        run: |
          crane tag \
            "${TENANT_IMAGE_NAME}:staging-${STAGING_SHA}" \
            latest

      - name: Summary
        run: |
          {
            echo "## Canary verified — :latest promoted"
            echo
            echo "- \`${IMAGE_NAME}:staging-${STAGING_SHA}\` → \`${IMAGE_NAME}:latest\`"
            echo "- \`${TENANT_IMAGE_NAME}:staging-${STAGING_SHA}\` → \`${TENANT_IMAGE_NAME}:latest\`"
            echo
            echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle."
          } >> "$GITHUB_STEP_SUMMARY"