diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml
index f013ee21..9f1c31bb 100644
--- a/.github/workflows/canary-staging.yml
+++ b/.github/workflows/canary-staging.yml
@@ -159,14 +159,46 @@ jobs:
           ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
         run: |
           set +e
+          # Slug prefix matches what test_staging_full_saas.sh emits
+          # in canary mode:
+          #   SLUG="e2e-canary-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
+          # Earlier this was `e2e-{today}-canary-` — that was the
+          # full-mode pattern (date FIRST, mode SECOND); canary slugs
+          # have mode FIRST, date SECOND. The mismatch silently
+          # never matched, leaving every cancelled-canary EC2 alive
+          # until the once-an-hour sweep eventually caught it
+          # (incident 2026-04-26 21:03Z: 1h25m EC2 leak before manual
+          # cleanup; same gap on three earlier cancellations today).
           orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
             -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
             | python3 -c "
-          import json, sys
+          import datetime, json, os, sys
+          run_id = os.environ.get('GITHUB_RUN_ID', '')
           d = json.load(sys.stdin)
-          today = __import__('datetime').date.today().strftime('%Y%m%d')
+          today = datetime.date.today()
+          stamp = today.strftime('%Y%m%d')
+          # Scope to slugs from THIS canary run when GITHUB_RUN_ID is
+          # available; the canary workflow sets E2E_RUN_ID='canary-\${run_id}'
+          # so the slug suffix is '-canary-\${run_id}-...'. Mirrors the
+          # full-mode safety net's per-run scoping (e2e-staging-saas.yml)
+          # added after the 2026-04-21 cross-run cleanup incident.
+          #
+          # The slug date is stamped when the test starts, but this step
+          # can run after midnight; a today-only prefix would then silently
+          # miss the org. The run-scoped branch therefore also accepts
+          # yesterday's stamp (safe: the run id pins it to this run). The
+          # unscoped fallback stays today-only so it does not widen the
+          # set of other runs' orgs it can match.
+          if run_id:
+              prev = (today - datetime.timedelta(days=1)).strftime('%Y%m%d')
+              prefixes = tuple(f'e2e-canary-{s}-canary-{run_id}' for s in (stamp, prev))
+          else:
+              prefixes = (f'e2e-canary-{stamp}-',)
+          # Null-safe lookups: stderr is discarded below, so a crash on a
+          # null orgs/slug field would be indistinguishable from having
+          # nothing to clean up.
-          candidates = [o['slug'] for o in d.get('orgs', [])
-                        if o.get('slug','').startswith(f'e2e-{today}-canary-')
+          candidates = [o['slug'] for o in (d.get('orgs') or [])
+                        if (o.get('slug') or '').startswith(prefixes)
                         and o.get('status') not in ('purged',)]
           print('\n'.join(candidates))
           " 2>/dev/null)