From 446f2d6e5150ce9d028b840ccb747edf3b2bb139 Mon Sep 17 00:00:00 2001
From: rabbitblood
Date: Sun, 19 Apr 2026 19:29:15 -0700
Subject: [PATCH] fix(ci): replace sleep 360 with health-check poll in canary-verify (#1013)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The canary-verify workflow blocked the self-hosted runner for a fixed
6 minutes regardless of whether canaries had already updated. This
wastes the runner slot when canaries update in 2-3 minutes.

Fix: poll each canary's /health endpoint every 30s for up to 7 min.
Exit early when all canaries report the expected SHA. Falls back to
proceeding after timeout — the smoke suite validates regardless.

The poll is defensive: an empty expected SHA never counts as a match,
the "sha" field is parsed with optional whitespace around the colon,
elapsed time is wall-clock (bash SECONDS), and a final check runs
after the last sleep.

Typical time saving: ~3-4 minutes per canary verify run.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/canary-verify.yml | 61 +++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/canary-verify.yml b/.github/workflows/canary-verify.yml
index d11890c6..daa6a206 100644
--- a/.github/workflows/canary-verify.yml
+++ b/.github/workflows/canary-verify.yml
@@ -48,9 +48,64 @@ jobs:
         run: echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
 
       - name: Wait for canary tenants to pick up :staging-
-        # Tenant auto-updater runs every 5 min. Sleep 6 min to give every
-        # canary time to pull + restart. Cheaper than polling.
-        run: sleep 360
+        # Poll canary health endpoints every 30s for up to 7 min instead
+        # of a fixed 6-min sleep. Exits as soon as ALL canaries report the
+        # new SHA, freeing the self-hosted runner slot sooner (~2-3 min
+        # typical vs 6 min fixed). Falls back to proceeding after 7 min
+        # even if not all canaries responded — the smoke suite will catch
+        # any that didn't update.
+        #
+        # CANARY_TENANT_URLS is a comma-separated list of canary base URLs;
+        # EXPECTED_SHA is the short SHA each /health response must report
+        # in its "sha" field.
+        env:
+          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
+          EXPECTED_SHA: ${{ steps.compute.outputs.sha }}
+        run: |
+          if [ -z "$CANARY_TENANT_URLS" ]; then
+            echo "No canary URLs configured — falling back to 60s wait"
+            sleep 60
+            exit 0
+          fi
+          MAX_WAIT=420 # 7 minutes
+          INTERVAL=30
+          if [ -z "$EXPECTED_SHA" ]; then
+            # An empty expectation would equal the empty SHA parsed from an
+            # unreachable canary and report success at 0s, so never poll.
+            echo "::warning::EXPECTED_SHA is empty — cannot verify canaries, waiting ${MAX_WAIT}s"
+            sleep "$MAX_WAIT"
+            exit 0
+          fi
+          IFS=',' read -ra URLS <<< "$CANARY_TENANT_URLS"
+          # SECONDS is bash's wall clock, so time spent in curl counts too.
+          SECONDS=0
+          while true; do
+            ALL_READY=true
+            for url in "${URLS[@]}"; do
+              url="${url//[[:space:]]/}" # tolerate "a, b" in the secret
+              url="${url%/}"             # avoid requesting "//health"
+              HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}")
+              # Allow whitespace around the colon ("sha": "abc"). The final
+              # "|| true" stops a no-match from aborting the step when the
+              # shell runs with -e and pipefail.
+              SHA=$(echo "$HEALTH" | grep -oE '"sha"[[:space:]]*:[[:space:]]*"[^"]*"' | head -1 | cut -d'"' -f4 || true)
+              if [ "$SHA" != "$EXPECTED_SHA" ]; then
+                ALL_READY=false
+                break
+              fi
+            done
+            if $ALL_READY; then
+              echo "All canaries running staging-${EXPECTED_SHA} after ${SECONDS}s"
+              exit 0
+            fi
+            # Give up only after a check, so the last sleep is never wasted.
+            if [ "$SECONDS" -ge "$MAX_WAIT" ]; then
+              break
+            fi
+            echo "Waiting for canaries... (${SECONDS}s / ${MAX_WAIT}s)"
+            sleep "$INTERVAL"
+          done
+          echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)"
 
       - name: Run canary smoke suite
         env: