fix(e2e-canvas): kill teardown race that poisons concurrent runs
Setup wrote .playwright-staging-state.json at the END (step 7), only
after org create + provision-wait + TLS + workspace create + workspace-
online all succeeded. If setup crashed at steps 1-6, the org existed in
CP but the state file did not, so Playwright's globalTeardown bailed
out ("nothing to tear down") and the workflow safety-net pattern-swept
every e2e-canvas-<today>-* org to compensate. That sweep deleted
concurrent runs' live tenants — including their CF DNS records —
causing victims' next fetch to die with `getaddrinfo ENOTFOUND`.
Race observed 2026-04-30 on PR #2264 staging→main: three real-test
runs killed each other mid-test, blocking 68 commits of staging→main
promotion.
Fix: write the state file as setup's first action, right after slug
generation, before any CP call. Now:
- Crash before slug gen → no state file, no orphan to clean
- Crash during steps 1-6 → state file has slug; teardown deletes
it (DELETE 404s if org never created)
- Setup completes → state file has full state; teardown
deletes the slug
The workflow safety-net no longer pattern-sweeps; it reads the state
file and deletes only the recorded slug. Concurrent canvas-E2E runs no
longer poison each other.
Verified by:
- tsc --noEmit on staging-setup.ts + staging-teardown.ts
- YAML lint on e2e-staging-canvas.yml
- Code review: state file write moved to line 113 (post-makeSlug,
pre-CP) with the original line-249 write retained as a "promote
to full state" overwrite at the end
This commit is contained in:
parent
e3588d4934
commit
15b98c4916
54
.github/workflows/e2e-staging-canvas.yml
vendored
54
.github/workflows/e2e-staging-canvas.yml
vendored
@ -153,43 +153,39 @@ jobs:
|
||||
path: canvas/test-results/
|
||||
retention-days: 14
|
||||
|
||||
# Safety-net teardown mirrors the bash-harness workflow — if
|
||||
# globalTeardown didn't run (worker crash, runner cancel), this
|
||||
# step sweeps any e2e-canvas-* org tagged with today's date.
|
||||
# Safety-net teardown — fires only when Playwright's globalTeardown
|
||||
# didn't (worker crash, runner cancel). Reads the slug from
|
||||
# canvas/.playwright-staging-state.json (written by staging-setup
|
||||
# as its first action, before any CP call) and deletes only that
|
||||
# slug.
|
||||
#
|
||||
# Earlier versions of this step pattern-swept `e2e-canvas-<today>-*`
|
||||
# orgs to compensate for setup-crash-before-state-file-write. That
|
||||
# over-aggressive cleanup raced concurrent canvas-E2E runs and
|
||||
# poisoned each other's tenants — observed 2026-04-30 when three
|
||||
# real-test runs killed each other mid-test, surfacing as
|
||||
# `getaddrinfo ENOTFOUND` once CP had cleaned up the just-deleted
|
||||
# DNS record. Pattern-sweep removed; setup now writes the state
|
||||
# file before any CP work, so the slug is always recoverable.
|
||||
- name: Teardown safety net
|
||||
if: always() && needs.detect-changes.outputs.canvas == 'true'
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
# Midnight-UTC rollover guard: a single-date filter misses
|
||||
# orgs created on the prior UTC day when the run crosses
|
||||
# midnight (incident 2026-04-26 23:46Z → 2026-04-27 00:12Z:
|
||||
# slug `e2e-canvas-20260426-1u8nz3` survived because the
|
||||
# safety-net step ran on the 27th, computed `today=20260427`,
|
||||
# and the filter `e2e-canvas-20260427-` never matched). Sweep
|
||||
# both today AND yesterday's dates so a cross-midnight run
|
||||
# still cleans up its own slug.
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys, datetime
|
||||
d = json.load(sys.stdin)
|
||||
today = datetime.date.today()
|
||||
yesterday = today - datetime.timedelta(days=1)
|
||||
prefixes = (
|
||||
f'e2e-canvas-{today.strftime(\"%Y%m%d\")}-',
|
||||
f'e2e-canvas-{yesterday.strftime(\"%Y%m%d\")}-',
|
||||
)
|
||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
||||
and o.get('status') not in ('purged',)]
|
||||
print('\n'.join(candidates))
|
||||
" 2>/dev/null)
|
||||
for slug in $orgs; do
|
||||
STATE_FILE=".playwright-staging-state.json"
|
||||
if [ ! -f "$STATE_FILE" ]; then
|
||||
echo "::notice::No state file at canvas/$STATE_FILE — Playwright globalTeardown handled it (or setup never ran)."
|
||||
exit 0
|
||||
fi
|
||||
slug=$(python3 -c "import json; print(json.load(open('$STATE_FILE')).get('slug',''))")
|
||||
if [ -z "$slug" ]; then
|
||||
echo "::warning::State file present but slug missing; nothing to clean up."
|
||||
exit 0
|
||||
fi
|
||||
echo "Deleting orphan tenant: $slug"
|
||||
curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/dev/null || true
|
||||
done
|
||||
exit 0
|
||||
|
||||
@ -111,6 +111,20 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
const adminAuth = { Authorization: `Bearer ${ADMIN_TOKEN}` };
|
||||
console.log(`[staging-setup] Using slug=${slug}`);
|
||||
|
||||
// Write the state file FIRST, before any CP call. Teardown (both
|
||||
// Playwright globalTeardown and the workflow safety-net) reads this
|
||||
// file to identify the slug it must clean up. If we wait until the
|
||||
// end of setup to write it (the previous behavior), a crash during
|
||||
// any of steps 1-6 leaves the org orphaned in CP with no record on
|
||||
// disk — forcing the workflow safety-net into a pattern-sweep over
|
||||
// every `e2e-canvas-<date>-*` org, which races with concurrent
|
||||
// canvas-E2E runs and deletes their live tenants. Race observed
|
||||
// 2026-04-30 on PR #2264 staging→main: three real-test runs killed
|
||||
// each other's tenants mid-test, surfacing as `getaddrinfo ENOTFOUND`
|
||||
// when CP cleaned up the just-deleted DNS record.
|
||||
const stateFile = join(process.cwd(), ".playwright-staging-state.json");
|
||||
writeFileSync(stateFile, JSON.stringify({ slug }, null, 2));
|
||||
|
||||
// 1. Create org via admin endpoint — no WorkOS session needed
|
||||
const create = await jsonFetch(`${CP_URL}/cp/admin/orgs`, {
|
||||
method: "POST",
|
||||
@ -245,8 +259,8 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
);
|
||||
console.log(`[staging-setup] Workspace online`);
|
||||
|
||||
// 7. Hand state off to tests + teardown
|
||||
const stateFile = join(process.cwd(), ".playwright-staging-state.json");
|
||||
// 7. Hand state off to tests + teardown — overwrite the slug-only
|
||||
// bootstrap state with the full state spec tests need.
|
||||
writeFileSync(
|
||||
stateFile,
|
||||
JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2),
|
||||
|
||||
@ -24,7 +24,11 @@ export default async function globalTeardown(): Promise<void> {
|
||||
|
||||
const stateFile = join(process.cwd(), ".playwright-staging-state.json");
|
||||
if (!existsSync(stateFile)) {
|
||||
console.warn("[staging-teardown] no state file — setup must have failed before org create; nothing to tear down");
|
||||
// staging-setup writes this file as its first action, before any
|
||||
// CP call. Missing here means setup never ran (CANVAS_E2E_STAGING
|
||||
// unset, or ran in a different cwd) — there's no slug we created
|
||||
// that needs cleaning up.
|
||||
console.warn("[staging-teardown] no state file — nothing to tear down");
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user