From 15b98c491664cd6668af1cde53013662700b7bef Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Wed, 29 Apr 2026 19:23:56 -0700 Subject: [PATCH] fix(e2e-canvas): kill teardown race that poisons concurrent runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setup wrote .playwright-staging-state.json at the END (step 7), only after org create + provision-wait + TLS + workspace create + workspace- online all succeeded. If setup crashed at steps 1-6, the org existed in CP but the state file did not, so Playwright's globalTeardown bailed out ("nothing to tear down") and the workflow safety-net pattern-swept every e2e-canvas--* org to compensate. That sweep deleted concurrent runs' live tenants — including their CF DNS records — causing victims' next fetch to die with `getaddrinfo ENOTFOUND`. Race observed 2026-04-30 on PR #2264 staging→main: three real-test runs killed each other mid-test, blocking 68 commits of staging→main promotion. Fix: write the state file as setup's first action, right after slug generation, before any CP call. Now: - Crash before slug gen → no state file, no orphan to clean - Crash during steps 1-6 → state file has slug; teardown deletes it (DELETE 404s if org never created) - Setup completes → state file has full state; teardown deletes the slug The workflow safety-net no longer pattern-sweeps; it reads the state file and deletes only the recorded slug. Concurrent canvas-E2E runs no longer poison each other. Verified by: - tsc --noEmit on staging-setup.ts + staging-teardown.ts - YAML lint on e2e-staging-canvas.yml - Code review: state file write moved to line 113 (post-makeSlug, pre-CP) with the original line-249 write retained as a "promote to full state" overwrite at the end --- .github/workflows/e2e-staging-canvas.yml | 62 +++++++++++------------- canvas/e2e/staging-setup.ts | 18 ++++++- canvas/e2e/staging-teardown.ts | 6 ++- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml index 634b8f4d..cd4bcd23 100644 --- a/.github/workflows/e2e-staging-canvas.yml +++ b/.github/workflows/e2e-staging-canvas.yml @@ -153,43 +153,39 @@ jobs: path: canvas/test-results/ retention-days: 14 - # Safety-net teardown mirrors the bash-harness workflow — if - # globalTeardown didn't run (worker crash, runner cancel), this - # step sweeps any e2e-canvas-* org tagged with today's date. + # Safety-net teardown — fires only when Playwright's globalTeardown + # didn't (worker crash, runner cancel). Reads the slug from + # canvas/.playwright-staging-state.json (written by staging-setup + # as its first action, before any CP call) and deletes only that + # slug. + # + # Earlier versions of this step pattern-swept `e2e-canvas--*` + # orgs to compensate for setup-crash-before-state-file-write. That + # over-aggressive cleanup raced concurrent canvas-E2E runs and + # poisoned each other's tenants — observed 2026-04-30 when three + # real-test runs killed each other mid-test, surfacing as + # `getaddrinfo ENOTFOUND` once CP had cleaned up the just-deleted + # DNS record. Pattern-sweep removed; setup now writes the state + # file before any CP work, so the slug is always recoverable. - name: Teardown safety net if: always() && needs.detect-changes.outputs.canvas == 'true' env: ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} run: | set +e - # Midnight-UTC rollover guard: a single-date filter misses - # orgs created on the prior UTC day when the run crosses - # midnight (incident 2026-04-26 23:46Z → 2026-04-27 00:12Z: - # slug `e2e-canvas-20260426-1u8nz3` survived because the - # safety-net step ran on the 27th, computed `today=20260427`, - # and the filter `e2e-canvas-20260427-` never matched). Sweep - # both today AND yesterday's dates so a cross-midnight run - # still cleans up its own slug. - orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \ - -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ - | python3 -c " - import json, sys, datetime - d = json.load(sys.stdin) - today = datetime.date.today() - yesterday = today - datetime.timedelta(days=1) - prefixes = ( - f'e2e-canvas-{today.strftime(\"%Y%m%d\")}-', - f'e2e-canvas-{yesterday.strftime(\"%Y%m%d\")}-', - ) - candidates = [o['slug'] for o in d.get('orgs', []) - if any(o.get('slug','').startswith(p) for p in prefixes) - and o.get('status') not in ('purged',)] - print('\n'.join(candidates)) - " 2>/dev/null) - for slug in $orgs; do - curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ - -H "Authorization: Bearer $ADMIN_TOKEN" \ - -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" >/dev/null || true - done + STATE_FILE=".playwright-staging-state.json" + if [ ! -f "$STATE_FILE" ]; then + echo "::notice::No state file at canvas/$STATE_FILE — Playwright globalTeardown handled it (or setup never ran)." + exit 0 + fi + slug=$(python3 -c "import json; print(json.load(open('$STATE_FILE')).get('slug',''))") + if [ -z "$slug" ]; then + echo "::warning::State file present but slug missing; nothing to clean up." + exit 0 + fi + echo "Deleting orphan tenant: $slug" + curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"confirm\":\"$slug\"}" >/dev/null || true exit 0 diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 5fc39225..77f7ef6e 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -111,6 +111,20 @@ export default async function globalSetup(_config: FullConfig): Promise { const adminAuth = { Authorization: `Bearer ${ADMIN_TOKEN}` }; console.log(`[staging-setup] Using slug=${slug}`); + // Write the state file FIRST, before any CP call. Teardown (both + // Playwright globalTeardown and the workflow safety-net) reads this + // file to identify the slug it must clean up. If we wait until the + // end of setup to write it (the previous behavior), a crash during + // any of steps 1-6 leaves the org orphaned in CP with no record on + // disk — forcing the workflow safety-net into a pattern-sweep over + // every `e2e-canvas--*` org, which races with concurrent + // canvas-E2E runs and deletes their live tenants. Race observed + // 2026-04-30 on PR #2264 staging→main: three real-test runs killed + // each other's tenants mid-test, surfacing as `getaddrinfo ENOTFOUND` + // when CP cleaned up the just-deleted DNS record. + const stateFile = join(process.cwd(), ".playwright-staging-state.json"); + writeFileSync(stateFile, JSON.stringify({ slug }, null, 2)); + // 1. Create org via admin endpoint — no WorkOS session needed const create = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { method: "POST", @@ -245,8 +259,8 @@ export default async function globalSetup(_config: FullConfig): Promise { ); console.log(`[staging-setup] Workspace online`); - // 7. Hand state off to tests + teardown - const stateFile = join(process.cwd(), ".playwright-staging-state.json"); + // 7. Hand state off to tests + teardown — overwrite the slug-only + // bootstrap state with the full state spec tests need. writeFileSync( stateFile, JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2), diff --git a/canvas/e2e/staging-teardown.ts b/canvas/e2e/staging-teardown.ts index b573cb2d..87e1ed7f 100644 --- a/canvas/e2e/staging-teardown.ts +++ b/canvas/e2e/staging-teardown.ts @@ -24,7 +24,11 @@ export default async function globalTeardown(): Promise { const stateFile = join(process.cwd(), ".playwright-staging-state.json"); if (!existsSync(stateFile)) { - console.warn("[staging-teardown] no state file — setup must have failed before org create; nothing to tear down"); + // staging-setup writes this file as its first action, before any + // CP call. Missing here means setup never ran (CANVAS_E2E_STAGING + // unset, or ran in a different cwd) — there's no slug we created + // that needs cleaning up. + console.warn("[staging-teardown] no state file — nothing to tear down"); return; }