From 9d45211fd3d36e7371d460175d886a31d268c2dd Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 14:01:50 -0700 Subject: [PATCH] canvas/e2e: surface admin-orgs row + workspace body on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two diagnostic upgrades to the Playwright staging-setup harness, both zero-behavior-change: 1. provision-failed throw now includes the full admin-orgs row (boot stage, last error, terraform/SSM state, etc) instead of just the slug. Every "provision failed: <slug>" in CI history was followed by a manual repro to find out WHY — that round-trip is gone. 2. workspace-failed throw dumps the full /workspaces/{id} body when last_sample_error is empty. Boot crashes, image-pull errors, missing PYTHONPATH, and OpenAI-quota-at-startup all surface as a bare "Workspace failed:" today (see #2632). Now they carry the boot_stage / image / last_error fields the API row exposes. No fix for the underlying flakes — those are tracked in #2632 (CP race) and #2578 (OpenAI quota). This just stops them looking identical in the CI log. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/e2e/staging-setup.ts | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 77f7ef6e..873ac07b 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -169,7 +169,17 @@ export default async function globalSetup(_config: FullConfig): Promise { orgID = row.id; return true; } - if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`); + if (row.instance_status === "failed") { + // Dump every diagnostic field the admin row carries — boot stage, + // last error, terraform/SSM state, etc. The bare slug message used + // to surface ZERO context, so triaging a failed provision meant + // re-running locally to repro. 
Now the failure log carries enough + // to point at the right subsystem (CP/AWS/SSM/runtime) without a + // second round-trip. + throw new Error( + `provision failed: ${slug} — admin-orgs row: ${JSON.stringify(row)}`, + ); + } return null; }, PROVISION_TIMEOUT_MS, @@ -249,7 +259,17 @@ export default async function globalSetup(_config: FullConfig): Promise { if (r.status !== 200) return null; if (r.body?.status === "online") return true; if (r.body?.status === "failed") { - throw new Error(`Workspace failed: ${r.body.last_sample_error || ""}`); + // last_sample_error is often empty when the failure happens before + // the agent emits a sample (e.g. boot crash, image pull error, + // missing PYTHONPATH, OpenAI quota at startup). Dumping the full + // body gives triage the boot_stage / last_error / image fields it + // needs without a second probe. Otherwise this propagates as a + // bare "Workspace failed: " — the exact useless message that + // sent #2632 to the issue tracker. + const detail = r.body.last_sample_error + ? r.body.last_sample_error + : `(no last_sample_error) full body: ${JSON.stringify(r.body)}`; + throw new Error(`Workspace failed: ${detail}`); } return null; },