From 754f361c03770666f423526f6b74e7fb7c8aef79 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 24 Apr 2026 17:32:12 -0700 Subject: [PATCH] =?UTF-8?q?fix(e2e):=20poll=20instance=5Fstatus=20not=20st?= =?UTF-8?q?atus=20=E2=80=94=20waitFor=20never=20matched,=20masked=20real?= =?UTF-8?q?=20bugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Staging Canvas Playwright E2E has been timing out at 1200s on every recent run. Found via /code-review-and-quality on the staging→main promotion chain. The CP /cp/admin/orgs response shape is (handlers/admin.go:118): type adminOrgSummary struct { ... InstanceStatus string `json:"instance_status,omitempty"` ... } There is NO top-level `status` field. The waitFor predicate compared `row.status === "running"` against undefined on every poll — the predicate could never resolve truthy. The harness invariably wedged on the 20-min timeout regardless of whether the tenant was actually provisioned. This bug has been double-edged: - It MASKED the #242 pq-cache-collision class for hours: the tenants WERE provisioning fine, but the test couldn't tell. - It survived #255, #257 (real CP fixes) — the test still timed out, making us suspect more CP bugs that didn't exist. Fix: poll `row.instance_status` instead. One-line change. Identical fix for the failed-state branch one line below. No new tests for the harness itself; the fix's correctness is verified by the next E2E run on the affected branch passing end-to-end. If it doesn't pass after this, there's a separate bug we can hunt cleanly. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/e2e/staging-setup.ts | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 7147f4ea..d8e77521 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -105,15 +105,24 @@ export default async function globalSetup(_config: FullConfig): Promise { } console.log(`[staging-setup] Org created: ${slug}`); - // 2. Wait for tenant running (admin-orgs list is the status source) + // 2. Wait for tenant running (admin-orgs list is the status source). + // + // The CP /cp/admin/orgs endpoint returns each org with an + // `instance_status` field (handlers/admin.go:adminOrgSummary, + // sourced from `org_instances.status`). NOT `status` — there's no + // top-level `status` on the row at all. A previous version of this + // test polled `row.status`, which was always undefined, so this + // waitFor never resolved truthy and the harness invariably timed + // out at 1200s — masking real CP bugs (see #242 chain) AND + // surviving real CP fixes alike. await waitFor( async () => { const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth }); if (r.status !== 200) return null; const row = (r.body?.orgs || []).find((o: any) => o.slug === slug); if (!row) return null; - if (row.status === "running") return true; - if (row.status === "failed") throw new Error(`provision failed: ${slug}`); + if (row.instance_status === "running") return true; + if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`); return null; }, PROVISION_TIMEOUT_MS,