From 81c4c02547bd12bca836e23f68eb6b3eeaf66139 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 21 Apr 2026 08:16:12 -0700 Subject: [PATCH] fix(e2e): safety-net teardown only sweeps this run's orgs Previously matched every e2e-YYYYMMDD-* slug, which stomped parallel CI runs AND manual dev probes against staging. Incident 2026-04-21 15:02Z: this workflow's safety net deleted an unrelated manual tenant 1s after it hit 'running', timing out the dev run at 15min. Scope to f'e2e-{today}-{GITHUB_RUN_ID}-' so each run only cleans its own leftovers. Empty run_id (local invocation) keeps the old broader behaviour so dev safety-nets still sweep. Also fix: the previous filter used o.get('status') which doesn't exist on the admin API response. Now reads instance_status (the real field). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/e2e-staging-saas.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml index e52367cc..c43e1200 100644 --- a/.github/workflows/e2e-staging-saas.yml +++ b/.github/workflows/e2e-staging-saas.yml @@ -128,9 +128,15 @@ jobs: run_id = os.environ.get('GITHUB_RUN_ID', '') d = json.load(sys.stdin) today = __import__('datetime').date.today().strftime('%Y%m%d') + # ONLY sweep slugs from *this* CI run. Previously the filter was + # f'e2e-{today}-' which stomped on parallel CI runs AND any manual + # E2E probes a dev was running against staging (incident 2026-04-21 + # 15:02Z: this workflow's safety net deleted an unrelated manual + # run's tenant 1s after it hit 'running'). + prefix = f'e2e-{today}-{run_id}-' if run_id else f'e2e-{today}-' candidates = [o['slug'] for o in d.get('orgs', []) - if o.get('slug','').startswith(f'e2e-{today}-') - and o.get('status') not in ('purged',)] + if o.get('slug','').startswith(prefix) + and o.get('instance_status') not in ('purged',)] print('\n'.join(candidates)) " 2>/dev/null) for slug in $orgs; do