From d7193dfa3469540330b6db0698e26db14bb7b781 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 21 Apr 2026 04:34:11 -0700 Subject: [PATCH] feat(e2e): pivot to admin-bearer-only auth + add sanity self-check workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces required secret surface from 2 (session cookie + admin token) to 1 (admin token). Pairs with molecule-controlplane#202 which adds: - POST /cp/admin/orgs — server-to-server org creation - GET /cp/admin/orgs/:slug/admin-token — per-tenant bearer fetch With those endpoints live, CI doesn't need to scrape a browser WorkOS session cookie. CP admin bearer (Railway CP_ADMIN_API_TOKEN) drives provision + tenant-token retrieval + teardown through a single credential. Changes ------- test_staging_full_saas.sh: admin bearer for provision/teardown, fetched per-tenant token drives all tenant API calls. Added E2E_INTENTIONAL_FAILURE=1 toggle that poisons the tenant token after provisioning so the teardown path gets exercised when the happy-path isn't. canvas/e2e/staging-setup.ts: same pivot; exports STAGING_TENANT_TOKEN instead of STAGING_SESSION_COOKIE. canvas/e2e/staging-tabs.spec.ts: context.setExtraHTTPHeaders with Authorization: Bearer on every page request, no cookie handling. All three workflows (e2e-staging-saas, canary-staging, e2e-staging-canvas): drop MOLECULE_STAGING_SESSION_COOKIE env + verification step. One secret to set. NEW e2e-staging-sanity.yml: weekly Mon 06:00 UTC. Runs the harness with E2E_INTENTIONAL_FAILURE=1 and inverts the pass condition — rc=1 is green, rc=0 (unexpected success) or rc=4 (leak) open a priority-high issue labelled e2e-safety-net. This is the answer to 'how do we know the teardown path still works when nothing else has failed recently.' STAGING_SAAS_E2E.md refreshed: single-secret setup, sanity workflow documented, canvas workflow added to the coverage matrix. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/canary-staging.yml | 7 +- .github/workflows/e2e-staging-canvas.yml | 7 +- .github/workflows/e2e-staging-saas.yml | 14 +- .github/workflows/e2e-staging-sanity.yml | 152 +++++++++++++++ canvas/e2e/staging-setup.ts | 131 ++++++------- canvas/e2e/staging-tabs.spec.ts | 113 +++++------ tests/e2e/STAGING_SAAS_E2E.md | 120 +++++++----- tests/e2e/test_staging_full_saas.sh | 235 ++++++++++------------- 8 files changed, 450 insertions(+), 329 deletions(-) create mode 100644 .github/workflows/e2e-staging-sanity.yml diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml index 8036b855..c5374df2 100644 --- a/.github/workflows/canary-staging.yml +++ b/.github/workflows/canary-staging.yml @@ -42,7 +42,6 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app - MOLECULE_SESSION_COOKIE: ${{ secrets.MOLECULE_STAGING_SESSION_COOKIE }} MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} E2E_MODE: canary E2E_RUNTIME: hermes @@ -51,10 +50,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Verify required secrets + - name: Verify admin token present run: | - if [ -z "$MOLECULE_SESSION_COOKIE" ] || [ -z "$MOLECULE_ADMIN_TOKEN" ]; then - echo "::error::Canary secrets missing — set MOLECULE_STAGING_SESSION_COOKIE and MOLECULE_STAGING_ADMIN_TOKEN" + if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then + echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set" exit 2 fi diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml index e5347fec..e3c667cf 100644 --- a/.github/workflows/e2e-staging-canvas.yml +++ b/.github/workflows/e2e-staging-canvas.yml @@ -39,7 +39,6 @@ jobs: env: CANVAS_E2E_STAGING: '1' MOLECULE_CP_URL: https://staging-api.moleculesai.app - MOLECULE_SESSION_COOKIE: ${{ secrets.MOLECULE_STAGING_SESSION_COOKIE }} MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} defaults: @@ -49,10 +48,10 @@ jobs: steps: - uses: 
actions/checkout@v4 - - name: Verify required secrets + - name: Verify admin token present run: | - if [ -z "$MOLECULE_SESSION_COOKIE" ] || [ -z "$MOLECULE_ADMIN_TOKEN" ]; then - echo "::error::Missing MOLECULE_STAGING_SESSION_COOKIE or MOLECULE_STAGING_ADMIN_TOKEN" + if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then + echo "::error::Missing MOLECULE_STAGING_ADMIN_TOKEN" exit 2 fi diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml index 5f776664..1780b72d 100644 --- a/.github/workflows/e2e-staging-saas.yml +++ b/.github/workflows/e2e-staging-saas.yml @@ -74,9 +74,9 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app - # Secrets referenced here must be configured in + # Single admin-bearer secret drives provision + tenant-token + # retrieval + teardown. Configure in # Settings → Secrets and variables → Actions → Repository secrets. - MOLECULE_SESSION_COOKIE: ${{ secrets.MOLECULE_STAGING_SESSION_COOKIE }} MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} E2E_RUNTIME: ${{ github.event.inputs.runtime || 'hermes' }} E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" @@ -85,17 +85,13 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Verify required secrets + - name: Verify admin token present run: | - if [ -z "$MOLECULE_SESSION_COOKIE" ]; then - echo "::error::MOLECULE_STAGING_SESSION_COOKIE secret not set" - exit 2 - fi if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then - echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set" + echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" exit 2 fi - echo "Secrets present ✓" + echo "Admin token present ✓" - name: CP staging health preflight run: | diff --git a/.github/workflows/e2e-staging-sanity.yml b/.github/workflows/e2e-staging-sanity.yml new file mode 100644 index 00000000..f20f628b --- /dev/null +++ b/.github/workflows/e2e-staging-sanity.yml @@ -0,0 +1,152 @@ +name: E2E Staging Sanity (leak-detection 
self-check) + +# Periodic assertion that the teardown safety nets in e2e-staging-saas +# and canary-staging actually work. Runs the E2E harness with +# E2E_INTENTIONAL_FAILURE=1, which poisons the tenant admin token after +# the org is provisioned. The workspace-provision step then fails, the +# script exits non-zero, and the EXIT trap + workflow always()-step +# must still tear down cleanly. +# +# A green run means: +# - The script exited non-zero (intentional failure caught) +# - The trap fired teardown +# - The leak-detection poll found zero orphan orgs +# +# A red run means the teardown path itself is broken — act on this the +# same way you'd act on a canary failure (the whole E2E safety net is +# compromised until it's fixed). +# +# Cadence: once a week, Monday 06:00 UTC. Drift-slow, not per-PR — the +# teardown path rarely changes, and a weekly heartbeat is enough to +# catch silent regressions in cleanup code paths. + +on: + schedule: + - cron: '0 6 * * 1' + workflow_dispatch: + +concurrency: + # Meant to keep sanity runs from colliding with canary + full on + # staging org-create quota — NOTE(review): only effective if they use this same group name; confirm. + group: e2e-staging-sanity + cancel-in-progress: false + +permissions: + issues: write + contents: read + +jobs: + sanity: + name: Intentional-failure teardown sanity + runs-on: ubuntu-latest + timeout-minutes: 20 + + env: + MOLECULE_CP_URL: https://staging-api.moleculesai.app + MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + E2E_MODE: canary # lean lifecycle; we only need the org to exist + E2E_RUNTIME: hermes + E2E_RUN_ID: "sanity-${{ github.run_id }}" + E2E_INTENTIONAL_FAILURE: "1" + + steps: + - uses: actions/checkout@v4 + + - name: Verify admin token present + run: | + if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then + echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set" + exit 2 + fi + + # Inverted assertion: the run MUST fail.
If it passes, the + # E2E_INTENTIONAL_FAILURE path is broken (token not being + # poisoned correctly, or the harness silently recovered). + - name: Run harness — expecting exit !=0 + id: harness + run: | + set +e + bash tests/e2e/test_staging_full_saas.sh + rc=$? + echo "harness_rc=$rc" >> "$GITHUB_OUTPUT" + # The only acceptable outcomes: + # 1 — harness failed mid-run, teardown ran, leak-check passed + # (exit 4 means teardown left a leak — that's the real bug + # this sanity check exists to catch) + if [ "$rc" = "1" ]; then + echo "✓ Harness failed as expected (rc=1); teardown trap ran, leak-check passed" + exit 0 + elif [ "$rc" = "0" ]; then + echo "::error::Harness succeeded under E2E_INTENTIONAL_FAILURE=1 — the poisoning path is broken" + exit 1 + elif [ "$rc" = "4" ]; then + echo "::error::LEAK DETECTED (rc=4) — teardown failed to clean up the org. Safety net broken." + exit 4 + else + echo "::error::Unexpected rc=$rc — neither clean-failure nor leak. Investigate harness." + exit 1 + fi + + - name: Open issue if safety net is broken + if: failure() + uses: actions/github-script@v7 + with: + script: | + const title = "🚨 E2E teardown safety net broken"; + const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + const body = + `The weekly sanity run (E2E_INTENTIONAL_FAILURE=1) did not exit ` + + `as expected. 
This means one of:\n` + + ` - poisoning didn't actually cause failure (test harness regression), OR\n` + + ` - teardown left an orphan org (leak detection caught a real bug)\n\n` + + `Run: ${runURL}\n\n` + + `This is higher priority than a canary failure — the whole ` + + `E2E safety net can't be trusted until this is resolved.`; + + const { data: existing } = await github.rest.issues.listForRepo({ + owner: context.repo.owner, repo: context.repo.repo, + state: 'open', labels: 'e2e-safety-net', + }); + const match = existing.find(i => i.title === title); + if (match) { + await github.rest.issues.createComment({ + owner: context.repo.owner, repo: context.repo.repo, + issue_number: match.number, + body: `Still broken. ${runURL}`, + }); + } else { + await github.rest.issues.create({ + owner: context.repo.owner, repo: context.repo.repo, + title, body, + labels: ['e2e-safety-net', 'bug', 'priority-high'], + }); + } + + # Belt-and-braces: if teardown left anything behind, nuke it here + # so we don't bleed staging quota. Matches only today's + # sanity-run slug prefix rather than every e2e-* org, so sanity-only + # orgs get cleaned up by sanity runs.
+ - name: Teardown safety net + if: always() + env: + ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + run: | + set +e + orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \ + -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ + | python3 -c " + import json, sys + d = json.load(sys.stdin) + today = __import__('datetime').date.today().strftime('%Y%m%d') + candidates = [o['slug'] for o in d.get('orgs', []) + if o.get('slug','').startswith(f'e2e-canary-{today}-sanity-') + and o.get('status') not in ('purged',)] + print('\n'.join(candidates)) + " 2>/dev/null) + for slug in $orgs; do + curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"confirm_token\":\"$slug\"}" >/dev/null || true + done + exit 0 diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 1850a426..598fb877 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -1,25 +1,21 @@ /** * Playwright global setup for the staging canvas E2E. * - * Provisions a fresh staging org per test run (via POST /cp/orgs against - * staging CP), waits for the tenant EC2 + cloudflared tunnel + TLS - * propagation, provisions one hermes workspace on the new tenant, waits - * for it to reach status=online, then exports: + * Provisions a fresh staging org per run (POST /cp/admin/orgs), fetches + * the per-tenant admin token, provisions one hermes workspace, waits + * for online, then exports: * - * STAGING_TENANT_URL — https://.moleculesai.app - * STAGING_WORKSPACE_ID — UUID of the provisioned hermes workspace - * STAGING_SLUG — org slug (for teardown) + * STAGING_TENANT_URL https://.moleculesai.app + * STAGING_WORKSPACE_ID UUID of the hermes workspace + * STAGING_TENANT_TOKEN per-tenant admin bearer (for spec requests) + * STAGING_SLUG org slug (used by teardown) * - * staging-teardown.ts consumes STAGING_SLUG to DELETE the org. 
- * - * Required env (set via GH Actions secrets in the workflow): - * MOLECULE_CP_URL default: https://staging-api.moleculesai.app - * MOLECULE_SESSION_COOKIE WorkOS session for the staging test user - * MOLECULE_ADMIN_TOKEN CP admin bearer for teardown (unused in setup - * but checked here so both halves fail fast) - * - * Runs only when CANVAS_E2E_STAGING=1 so local `pnpm playwright test` in - * dev doesn't try to provision against staging by accident. + * Required env: + * MOLECULE_CP_URL default: https://staging-api.moleculesai.app + * MOLECULE_ADMIN_TOKEN CP admin bearer (Railway staging + * CP_ADMIN_API_TOKEN). Drives provision + + * tenant-token retrieval + teardown via a + * single credential. */ import type { FullConfig } from "@playwright/test"; @@ -27,11 +23,10 @@ import { writeFileSync } from "fs"; import { join } from "path"; const CP_URL = process.env.MOLECULE_CP_URL || "https://staging-api.moleculesai.app"; -const SESSION = process.env.MOLECULE_SESSION_COOKIE; const ADMIN_TOKEN = process.env.MOLECULE_ADMIN_TOKEN; const STAGING = process.env.CANVAS_E2E_STAGING === "1"; -const PROVISION_TIMEOUT_MS = 15 * 60 * 1000; // 15 min cold-boot budget +const PROVISION_TIMEOUT_MS = 15 * 60 * 1000; const WORKSPACE_ONLINE_TIMEOUT_MS = 10 * 60 * 1000; const TLS_TIMEOUT_MS = 3 * 60 * 1000; @@ -41,10 +36,7 @@ async function jsonFetch( ): Promise<{ status: number; body: any }> { const res = await fetch(url, { ...init, - headers: { - "Content-Type": "application/json", - ...(init.headers || {}), - }, + headers: { "Content-Type": "application/json", ...(init.headers || {}) }, }); let body: any = null; try { @@ -71,8 +63,6 @@ async function waitFor( } function makeSlug(): string { - // Matches CP's ^[a-z][a-z0-9-]{2,31}$. The "e2e-" prefix lets auto-cleanup - // crons grep-find leftovers from crashed runs. 
const y = new Date().toISOString().slice(0, 10).replace(/-/g, ""); const rand = Math.random().toString(36).slice(2, 8); return `e2e-canvas-${y}-${rand}`.slice(0, 32); @@ -83,67 +73,65 @@ export default async function globalSetup(_config: FullConfig): Promise { console.log("[staging-setup] CANVAS_E2E_STAGING not set, skipping"); return; } - - if (!SESSION) { - throw new Error("MOLECULE_SESSION_COOKIE required for staging E2E"); - } if (!ADMIN_TOKEN) { throw new Error( - "MOLECULE_ADMIN_TOKEN required for staging E2E (teardown needs it)", + "MOLECULE_ADMIN_TOKEN required (Railway staging CP_ADMIN_API_TOKEN)", ); } const slug = makeSlug(); - const cookieHeader = `molecule_cp_session=${SESSION}`; + const adminAuth = { Authorization: `Bearer ${ADMIN_TOKEN}` }; console.log(`[staging-setup] Using slug=${slug}`); - // 1. Accept terms (idempotent — already-accepted returns 2xx or 400) - await jsonFetch(`${CP_URL}/cp/auth/accept-terms`, { + // 1. Create org via admin endpoint — no WorkOS session needed + const create = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { method: "POST", - headers: { Cookie: cookieHeader }, - body: JSON.stringify({}), - }).catch(() => { - /* best-effort */ - }); - - // 2. Create org - const create = await jsonFetch(`${CP_URL}/cp/orgs`, { - method: "POST", - headers: { Cookie: cookieHeader }, - body: JSON.stringify({ slug, name: `E2E Canvas ${slug}` }), + headers: adminAuth, + body: JSON.stringify({ + slug, + name: `E2E Canvas ${slug}`, + owner_user_id: `e2e-runner:${slug}`, + }), }); if (create.status >= 400) { throw new Error( - `POST /cp/orgs returned ${create.status}: ${JSON.stringify(create.body)}`, + `POST /cp/admin/orgs ${create.status}: ${JSON.stringify(create.body)}`, ); } console.log(`[staging-setup] Org created: ${slug}`); - // 3. Wait for tenant provision (status=running) - const finalStatus = await waitFor<{ url?: string; status: string }>( + // 2. 
Wait for tenant running (admin-orgs list is the status source) + await waitFor( async () => { - const r = await jsonFetch( - `${CP_URL}/cp/orgs/${slug}/provision-status`, - { headers: { Cookie: cookieHeader } }, - ); + const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth }); if (r.status !== 200) return null; - if (r.body?.status === "running") return r.body; - if (r.body?.status === "failed") { - throw new Error(`Provisioning failed: ${JSON.stringify(r.body)}`); - } + const row = (r.body?.orgs || []).find((o: any) => o.slug === slug); + if (!row) return null; + if (row.status === "running") return true; + if (row.status === "failed") throw new Error(`provision failed: ${slug}`); return null; }, PROVISION_TIMEOUT_MS, 15_000, "tenant provision", ); + console.log(`[staging-setup] Tenant running`); - const tenantURL = - finalStatus.url || - `https://${slug}.${CP_URL.includes("staging") ? "moleculesai.app" : "moleculesai.app"}`; + // 3. Fetch per-tenant admin token + const tokRes = await jsonFetch( + `${CP_URL}/cp/admin/orgs/${slug}/admin-token`, + { headers: adminAuth }, + ); + if (tokRes.status !== 200 || !tokRes.body?.admin_token) { + throw new Error( + `tenant-token fetch ${tokRes.status}: ${JSON.stringify(tokRes.body)}`, + ); + } + const tenantToken: string = tokRes.body.admin_token; + const tenantURL = `https://${slug}.moleculesai.app`; console.log(`[staging-setup] Tenant URL: ${tenantURL}`); - // 4. Wait for tenant TLS readiness + // 4. TLS readiness await waitFor( async () => { try { @@ -160,10 +148,11 @@ export default async function globalSetup(_config: FullConfig): Promise { "tenant TLS", ); - // 5. Provision one hermes workspace (cheapest, fastest-booting) + // 5. 
Provision workspace + const tenantAuth = { Authorization: `Bearer ${tenantToken}` }; const ws = await jsonFetch(`${tenantURL}/workspaces`, { method: "POST", - headers: { Cookie: cookieHeader }, + headers: tenantAuth, body: JSON.stringify({ name: "E2E Canvas Test", runtime: "hermes", @@ -172,9 +161,7 @@ export default async function globalSetup(_config: FullConfig): Promise { }), }); if (ws.status >= 400 || !ws.body?.id) { - throw new Error( - `Workspace create failed (${ws.status}): ${JSON.stringify(ws.body)}`, - ); + throw new Error(`Workspace create ${ws.status}: ${JSON.stringify(ws.body)}`); } const workspaceId = ws.body.id as string; console.log(`[staging-setup] Workspace created: ${workspaceId}`); @@ -183,14 +170,12 @@ export default async function globalSetup(_config: FullConfig): Promise { await waitFor( async () => { const r = await jsonFetch(`${tenantURL}/workspaces/${workspaceId}`, { - headers: { Cookie: cookieHeader }, + headers: tenantAuth, }); if (r.status !== 200) return null; if (r.body?.status === "online") return true; if (r.body?.status === "failed") { - throw new Error( - `Workspace ${workspaceId} failed: ${r.body.last_sample_error || ""}`, - ); + throw new Error(`Workspace failed: ${r.body.last_sample_error || ""}`); } return null; }, @@ -200,19 +185,15 @@ export default async function globalSetup(_config: FullConfig): Promise { ); console.log(`[staging-setup] Workspace online`); - // 7. Export via a state file so staging-teardown and the test spec can - // pick up the same slug / urls. Playwright's global setup can't - // export env to the test subprocess directly in all configurations. + // 7. Hand state off to tests + teardown const stateFile = join(process.cwd(), ".playwright-staging-state.json"); writeFileSync( stateFile, - JSON.stringify({ slug, tenantURL, workspaceId }, null, 2), + JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2), ); - // Also set env for in-process test reads. 
process.env.STAGING_SLUG = slug; process.env.STAGING_TENANT_URL = tenantURL; process.env.STAGING_WORKSPACE_ID = workspaceId; - process.env.STAGING_SESSION_COOKIE = SESSION; - + process.env.STAGING_TENANT_TOKEN = tenantToken; console.log(`[staging-setup] Ready — ${stateFile}`); } diff --git a/canvas/e2e/staging-tabs.spec.ts b/canvas/e2e/staging-tabs.spec.ts index 6e8b5d9c..412953a5 100644 --- a/canvas/e2e/staging-tabs.spec.ts +++ b/canvas/e2e/staging-tabs.spec.ts @@ -3,20 +3,24 @@ * fresh staging org provisioned in the global setup. Asserts each tab * renders without throwing and captures a screenshot for visual review. * - * Relies on `staging-setup.ts` to provision a tenant org, provision one - * hermes workspace on it, and hand us a tenant URL + workspace id via - * env (set by the setup file before tests run). Global teardown tears - * down the org. + * Auth model: the tenant platform's AdminAuth middleware accepts a bearer + * token OR a WorkOS session cookie. Playwright can't mint a WorkOS + * session, so we feed the per-tenant admin token (fetched in global + * setup via GET /cp/admin/orgs/:slug/admin-token) as an Authorization: + * Bearer header via context.setExtraHTTPHeaders(). Every browser + * request inherits the header. * - * Runs only when CANVAS_E2E_STAGING=1 — tests are skipped in local dev - * where the prerequisite env isn't set. + * Known SaaS gaps — documented in #1369 and allowed to render errored + * content without failing the test (the gate is "no hard crash, no + * 'Failed to load' toast"): + * - Files tab: empty (platform can't docker exec into a remote EC2) + * - Terminal tab: WS connect fails + * - Peers tab: 401 without workspace-scoped token */ import { test, expect } from "@playwright/test"; // Tab ids as declared in canvas/src/components/SidePanel.tsx TABS. -// Kept duplicated here (not imported) because Playwright tests run outside -// the Next.js bundler and can't import from @/components paths. 
const TAB_IDS = [ "chat", "activity", @@ -43,32 +47,21 @@ test.describe("staging canvas tabs", () => { context, }) => { const tenantURL = process.env.STAGING_TENANT_URL; - const sessionCookie = process.env.STAGING_SESSION_COOKIE; + const tenantToken = process.env.STAGING_TENANT_TOKEN; const workspaceId = process.env.STAGING_WORKSPACE_ID; - if (!tenantURL || !sessionCookie || !workspaceId) { + if (!tenantURL || !tenantToken || !workspaceId) { throw new Error( - "staging-setup.ts did not export STAGING_TENANT_URL / STAGING_SESSION_COOKIE / STAGING_WORKSPACE_ID — did global setup run?", + "staging-setup.ts did not export STAGING_TENANT_URL / STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID — did global setup run?", ); } - // The session cookie was minted by CP at sign-in; canvas on the tenant - // subdomain shares it via the parent-domain scope (.moleculesai.app). - // Playwright needs both the cookie and the cross-domain visibility. - const url = new URL(tenantURL); - await context.addCookies([ - { - name: "molecule_cp_session", - value: sessionCookie, - // Leading dot → valid on all subdomains. The staging WorkOS auth - // flow sets it this way, so we mirror. - domain: "." + url.hostname.replace(/^[^.]+\./, ""), - path: "/", - httpOnly: true, - secure: true, - sameSite: "Lax", - }, - ]); + // Attach the per-tenant admin bearer to every outbound request. + // The tenant platform's AdminAuth middleware accepts this; no + // WorkOS session needed. + await context.setExtraHTTPHeaders({ + Authorization: `Bearer ${tenantToken}`, + }); const consoleErrors: string[] = []; page.on("console", (msg) => { @@ -79,12 +72,13 @@ test.describe("staging canvas tabs", () => { await page.goto(tenantURL, { waitUntil: "networkidle" }); - // Canvas hydration races WebSocket connect + /workspaces fetch. Wait - // for the workspace node selector or the hydration-error banner — - // whichever wins first. 
- await page.waitForSelector('[role="tablist"], [data-testid="hydration-error"]', { - timeout: 45_000, - }); + // Canvas hydration races WebSocket connect + /workspaces fetch. + // Wait for the tablist element (appears after a workspace is + // selected) or the hydration-error banner — whichever wins first. + await page.waitForSelector( + '[role="tablist"], [data-testid="hydration-error"]', + { timeout: 45_000 }, + ); const hydrationErr = await page .locator('[data-testid="hydration-error"]') @@ -94,20 +88,19 @@ test.describe("staging canvas tabs", () => { "canvas hydration failed — check staging CP + tenant reachability", ).toBe(0); - // Click the workspace node to open the side panel. The node's - // accessible name is the workspace display name; we match by id attr - // to avoid coupling to the display name which tests can't know. - const node = page.locator(`[data-workspace-id="${workspaceId}"]`).first(); - // Fallback: click by role if the data attribute isn't wired - if ((await node.count()) === 0) { - // Try clicking the first workspace card visible - const firstNode = page.locator('[role="button"][aria-label*="Workspace"]').first(); - await firstNode.click({ timeout: 10_000 }); + // Click the workspace node to open the side panel. Try a data + // attribute first, fall back to a generic role-based selector so + // the test doesn't break when the node-card markup changes. 
+ const byDataAttr = page.locator(`[data-workspace-id="${workspaceId}"]`).first(); + if ((await byDataAttr.count()) > 0) { + await byDataAttr.click({ timeout: 10_000 }); } else { - await node.click({ timeout: 10_000 }); + const firstNode = page + .locator('[role="button"][aria-label*="Workspace" i]') + .first(); + await firstNode.click({ timeout: 10_000 }); } - // Wait for the side panel tablist to mount await page.waitForSelector('[role="tablist"]', { timeout: 15_000 }); for (const tabId of TAB_IDS) { @@ -120,23 +113,17 @@ test.describe("staging canvas tabs", () => { await tabButton.click(); const panel = page.locator(`#panel-${tabId}`); - await expect( - panel, - `panel for ${tabId} never rendered`, - ).toBeVisible({ timeout: 10_000 }); + await expect(panel, `panel for ${tabId} never rendered`).toBeVisible({ + timeout: 10_000, + }); - // No toast-style error banner should appear for a healthy workspace. - // Known exceptions: terminal may 4xx on SaaS cross-EC2 (WS target - // unreachable), peers may 401 without workspace token. Those are - // reported separately in issue #1369; here we just guard against - // hard crashes (toast with "Error" keyword). + // "Failed to load" toast = hard crash. Known SaaS-mode gaps + // (Files empty, Terminal disconnected, Peers 401) surface as + // in-panel content, not toasts. const errorToasts = await page .locator('[role="alert"]:has-text("Failed to load")') .count(); - expect( - errorToasts, - `tab ${tabId}: saw "Failed to load" toast`, - ).toBe(0); + expect(errorToasts, `tab ${tabId}: "Failed to load" toast`).toBe(0); await page.screenshot({ path: `test-results/staging-tab-${tabId}.png`, @@ -145,14 +132,16 @@ test.describe("staging canvas tabs", () => { }); } - // Aggregate console-error check. Allow a small budget for known-noisy - // Sentry/Vercel analytics errors that don't reflect app health. + // Aggregate console-error budget. 
Known-noisy sources whitelisted: + // Sentry, Vercel analytics, WS reconnects (expected on SaaS + // terminal), favicon 404 (cosmetic). const appErrors = consoleErrors.filter( (msg) => !msg.includes("sentry") && !msg.includes("vercel") && - !msg.includes("WebSocket") && // WS failures ≠ app failures - !msg.includes("favicon"), + !msg.includes("WebSocket") && + !msg.includes("favicon") && + !msg.includes("molecule-icon.png"), // another cosmetic 404 ); expect( appErrors, diff --git a/tests/e2e/STAGING_SAAS_E2E.md b/tests/e2e/STAGING_SAAS_E2E.md index 11d1c973..dd4e3095 100644 --- a/tests/e2e/STAGING_SAAS_E2E.md +++ b/tests/e2e/STAGING_SAAS_E2E.md @@ -1,63 +1,76 @@ -# Staging full-SaaS E2E +# Staging SaaS E2E — runbook -`tests/e2e/test_staging_full_saas.sh` provisions a fresh org per run, exercises the workspace lifecycle end-to-end, then tears the org down and asserts leak-free. Runs in CI via `.github/workflows/e2e-staging-saas.yml`. +Four workflows + a shared bash harness that together cover the SaaS stack end to end against live staging. Every workflow provisions a fresh org per run and tears it down; leaks are CI failures. -## What it covers +## Coverage -| Step | What it verifies | +| Workflow | Cadence | Wall time | Scope | +|---|---|---|---| +| `e2e-staging-saas.yml` | push + nightly 07:00 UTC | ~20 min | Full API: org → tenant → 2 workspaces → A2A → HMA → delegation → leak check | +| `canary-staging.yml` | every 30 min | ~8 min | Minimum smoke + self-managed alert issue | +| `e2e-staging-canvas.yml` | push + weekly Sunday 08:00 | ~25 min | All 13 canvas workspace-panel tabs via Playwright | +| `e2e-staging-sanity.yml` | weekly Monday 06:00 UTC | ~10 min | Intentional-failure: teardown safety-net self-check | + +`tests/e2e/test_staging_full_saas.sh` is the shared harness the API-level workflows (full, canary, sanity) invoke (with `E2E_MODE={full|canary}` and `E2E_INTENTIONAL_FAILURE={0|1}` toggles); the canvas workflow provisions its org from Playwright global setup (`canvas/e2e/staging-setup.ts`) instead. + +### Full-SaaS checklist (sections) + +| # | What | |---|---| -| 1.
Accept terms (POST `/cp/auth/accept-terms`) | Session cookie valid, ToS gate honours idempotent replay | -| 2. Create org (POST `/cp/orgs`) | Slug validation, member insert, billing gate, quota | -| 3. Wait for provisioning | CP tenant EC2 boot + cloudflared tunnel + DNS + TLS (~5–10 min cold) | -| 4. Tenant health (GET `/health` on new tenant URL) | Cert chain OK, TenantGuard + session-auth wired | -| 5. Provision parent workspace | SaaS provision path (CP RunInstances, EC2 bootstrap, runtime register) | -| 6. Provision child workspace under parent | `parent_id` relationship, team-hierarchy | -| 7. Wait both online | Workspace sweeper + register handler + token bootstrap | -| 8. A2A round-trip (POST `/workspaces/:id/a2a`) | Full LLM loop — registration, MCP tools, provider auth, response shape | -| 9. HMA memory write+read | `/memories` scope routing, awareness namespace, persistence | -| 9b. Peers + activity smoke | Route registration + activity-log write path | -| 10. Teardown | `DELETE /cp/admin/tenants/:slug` + leak assertion | +| 0 | CP preflight | +| 1 | `POST /cp/admin/orgs` — org create without WorkOS session | +| 2 | Wait for tenant status = running | +| 3 | `GET /cp/admin/orgs/:slug/admin-token` — fetch per-tenant bearer | +| 4 | Tenant TLS readiness on `/health` | +| 5 | Provision parent workspace | +| 6 | Provision child workspace (full mode) | +| 7 | Wait both online | +| 8 | A2A round-trip on parent — expect agent response | +| 9 | HMA memory write + read, peers smoke, activity log (full mode) | +| 10 | Delegation mechanics: parent → child via proxy + activity assertion (full mode) | +| 11 | EXIT trap — teardown + leak detection | -If any step fails, the EXIT trap tears down the org anyway. 
+### Canvas tabs -## Required GitHub Actions secrets +Opens all 13 workspace-panel tabs against the freshly-provisioned org: -Both are at **Settings → Secrets and variables → Actions → Repository secrets**: +``` +chat, activity, details, skills, terminal, config, schedule, +channels, files, memory, traces, events, audit +``` -### `MOLECULE_STAGING_SESSION_COOKIE` +Per tab: visible, panel renders, no "Failed to load" toast, screenshot captured. Known SaaS-mode gaps (Files empty, Terminal disconnect, Peers 401) are whitelisted — see issue #1369. -A valid `molecule_cp_session` cookie for a **test user** that: +### Sanity self-check -- is on the staging beta allowlist (or `BETA_GATE_ENABLED=false` on staging) -- has already accepted the current terms version (the script re-accepts idempotently but can't bootstrap from unaccepted) -- has under-quota owned orgs +Runs the harness with `E2E_INTENTIONAL_FAILURE=1`, which poisons the tenant admin token after the org is provisioned. The workspace-provision step then fails and the script exits non-zero; the EXIT trap + teardown + leak assertion must still run clean. If they don't, the sanity workflow files a `priority-high` issue with label `e2e-safety-net`. -**How to extract:** +## Required secret (exactly one) -1. In an incognito window, sign in at `https://staging-api.moleculesai.app/cp/auth/login` with the test user. -2. DevTools → Application → Cookies → `https://staging-api.moleculesai.app` -3. Copy the `molecule_cp_session` value (base64-looking blob). -4. Paste as the secret value. Do not include the `molecule_cp_session=` prefix. - -**Rotation:** WorkOS sessions don't expire until the user signs out or the refresh token revokes. A 90-day rotation schedule is safe. +Set in **Settings → Secrets and variables → Actions → Repository secrets**: ### `MOLECULE_STAGING_ADMIN_TOKEN` -The `CP_ADMIN_API_TOKEN` env var currently set on the Railway **staging** molecule-platform → controlplane service. 
- -**How to extract:** +The `CP_ADMIN_API_TOKEN` env currently set on the Railway staging molecule-platform → controlplane service. ``` -railway variables --service controlplane --environment staging --kv | grep CP_ADMIN_API_TOKEN +railway variables --environment staging --service controlplane --kv | grep CP_ADMIN_API_TOKEN ``` -Used exclusively for teardown (`DELETE /cp/admin/tenants/:slug`) and leak detection (`GET /cp/admin/orgs`). Write access, treat like prod admin. +This **one** secret drives everything: + +- `POST /cp/admin/orgs` — provision org (no WorkOS session needed) +- `GET /cp/admin/orgs/:slug/admin-token` — fetch per-tenant bearer +- `DELETE /cp/admin/tenants/:slug` — teardown +- `GET /cp/admin/orgs` — leak detection post-teardown + +The per-tenant admin token (short-lived, per-org) drives every tenant-side call (`POST /workspaces`, `/memories`, `/a2a`, etc.). + +**No WorkOS session cookie needed** — admin endpoints bypass session auth via `AdminGate` (bearer + rate-limit only). CI provision + teardown collapse to one credential. ## Running locally ``` -export MOLECULE_CP_URL=https://staging-api.moleculesai.app -export MOLECULE_SESSION_COOKIE="…" export MOLECULE_ADMIN_TOKEN="…" # Optional: keep the org for post-mortem inspection export E2E_KEEP_ORG=1 @@ -68,14 +81,29 @@ bash tests/e2e/test_staging_full_saas.sh ## Cost -- Full run: ~20 min wall clock -- Compute: ~12 min of t3.small tenant EC2 + ~4 min of per-workspace EC2 × 2 = ~20 t3.small-minutes ≈ **$0.007/run** -- Daily (nightly cron + PR runs ≈ 5/day): **~$0.04/day** -- Hard timeout (30 min workflow timeout + per-request curl timeouts) caps runaway cost +- Full run: ~20 min, ~$0.007 +- Canary (48/day): ~$0.06/day +- Canvas (few/week): ~$0.01/day +- Sanity (weekly): ~$0.002/week +- **Total staging burn: < $0.15/day** at expected CI load -## Known gaps (follow-ups) +Hard per-workflow timeouts (15–40 min) cap runaway cost. 
Three teardown layers: -- Canvas UI tabs not covered — separate Playwright workflow in `e2e-staging-canvas.yml` (todo) -- Delegation end-to-end (parent calls `delegate_task` MCP tool against child) — not in this run because it needs a real LLM loop and doubles runtime cost -- Claude Code runtime test — currently only Hermes is exercised to keep wall time down; pass `runtime: claude-code` via workflow_dispatch to test it -- No screenshot/trace capture on failure — add if CI signal is noisy +1. Bash `trap cleanup_org EXIT INT TERM` in the harness +2. Playwright `globalTeardown` for the canvas workflow +3. `if: always()` step in every workflow that greps today's `e2e-*` orgs and force-deletes them + +## Exit codes + +| Code | Meaning | +|---|---| +| 0 | Happy path | +| 1 | Generic failure (agent didn't respond, provisioning hung, etc.) | +| 2 | Missing required env | +| 3 | Provisioning timed out (reserved — the tenant-provisioning wait loop currently reports its timeout through `fail`, i.e. exit 1; confirm before keying automation on 3) | +| 4 | Teardown left orphan resources (**leak detected — sanity workflow catches this**) | + +## Known gaps (tracked elsewhere) + +- [#1369](https://github.com/Molecule-AI/molecule-core/issues/1369): SaaS canvas Files / Terminal / Peers tabs — architecturally broken; whitelisted in the spec +- LLM-driven delegation (autonomous `delegate_task` tool use) — probabilistic, not in v1; proxy mechanics covered diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 4e0ace5c..85e5bc8a 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -1,61 +1,50 @@ #!/usr/bin/env bash # Full-lifecycle SaaS E2E against staging. # -# Creates a fresh org per run (unique slug), waits for tenant EC2 + cloudflared -# provisioning, exercises every major workspace-level API (registration, -# heartbeat, A2A, delegation, HMA memory, activity, peers, events), then -# tears the whole org down and asserts that every cloud artefact (EC2, SG, -# Cloudflare tunnel, DNS record, DB rows) has gone.
A leaked resource at -# teardown is a CI failure — that's the whole point of per-run org -# provisioning. +# Creates a fresh org per run (unique slug), waits for tenant EC2 + +# cloudflared provisioning, exercises every major workspace-level API +# (register, heartbeat, A2A, delegation, HMA memory, activity, peers), +# then tears the whole org down and asserts that every cloud artefact +# (EC2, SG, Cloudflare tunnel, DNS record, DB rows) is gone. A leaked +# resource at teardown is a CI failure. +# +# Auth model: +# Single MOLECULE_ADMIN_TOKEN (= CP_ADMIN_API_TOKEN on Railway staging) +# drives everything: +# - POST /cp/admin/orgs to provision (no WorkOS session scraping) +# - GET /cp/admin/orgs/:slug/admin-token to retrieve the per-tenant +# ADMIN_TOKEN once provisioning completes +# - DELETE /cp/admin/tenants/:slug for teardown +# The per-tenant admin token drives all tenant API calls (workspaces, +# memories, a2a). # # Required env: -# MOLECULE_CP_URL Staging CP base URL (default: -# https://staging-api.moleculesai.app) -# MOLECULE_SESSION_COOKIE Valid WorkOS session cookie for a test -# user that's already in the beta -# allowlist AND has accepted current terms. -# Extract from browser after signing in to -# staging. Name: molecule_cp_session. -# MOLECULE_ADMIN_TOKEN CP admin bearer (CP_ADMIN_API_TOKEN on -# Railway). Used for teardown via -# DELETE /cp/admin/tenants/:slug and for -# leak-detection reads. +# MOLECULE_CP_URL default: https://staging-api.moleculesai.app +# MOLECULE_ADMIN_TOKEN CP admin bearer — Railway CP_ADMIN_API_TOKEN # # Optional env: -# E2E_RUNTIME Which runtime to test the agent round-trip -# with. Default: hermes (fastest boot, cheap). -# Use claude-code when you need to validate -# that fix. -# E2E_PROVISION_TIMEOUT_SECS How long to wait for the tenant EC2 to -# come up. Default: 900 (15 min — cold -# EC2 + cloudflared tunnel + DNS propagation -# can touch that window). -# E2E_KEEP_ORG If set to 1, skip teardown. 
ONLY use -# locally for debugging — CI must never -# set this or staging fills with orphans. -# E2E_RUN_ID Override the auto-generated suffix. CI -# should pass ${GITHUB_RUN_ID} so the -# org slug is grep-able in AWS later. -# E2E_MODE "full" (default) runs every section. -# "canary" runs a lean variant: one -# parent workspace, one A2A PONG, then -# teardown. Used by the 30-min cron -# workflow so each canary finishes in -# ~8 min instead of the full ~20. +# E2E_RUNTIME hermes (default) | claude-code | langgraph +# E2E_PROVISION_TIMEOUT_SECS default 900 (15 min cold EC2 budget) +# E2E_KEEP_ORG 1 → skip teardown (debugging only) +# E2E_RUN_ID Slug suffix; CI: ${GITHUB_RUN_ID} +# E2E_MODE full (default) | canary +# E2E_INTENTIONAL_FAILURE 1 → poison tenant token mid-run so the +# script fails; the EXIT trap MUST still +# tear down cleanly (and exit 4 on leak). +# Used by a dedicated sanity workflow +# that verifies the safety net. # # Exit codes: # 0 happy path -# 1 generic failure (see log) +# 1 generic failure # 2 missing required env # 3 provisioning timed out -# 4 cleanup left orphan resources (leak detected) +# 4 teardown left orphan resources set -euo pipefail CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}" -SESSION_COOKIE="${MOLECULE_SESSION_COOKIE:?MOLECULE_SESSION_COOKIE required — see header for how to obtain}" -ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — from Railway molecule-platform CP env}" +ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}" RUNTIME="${E2E_RUNTIME:-hermes}" PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}" RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}" @@ -65,13 +54,15 @@ case "$MODE" in *) echo "E2E_MODE must be 'full' or 'canary' (got: $MODE)" >&2; exit 2 ;; esac -# Slug constraints from orgs.go: ^[a-z][a-z0-9-]{2,31}$. 
-# Prefix with "e2e-" so test orgs are grep-able and auto-cleanup crons -# can target them even when a script crashes before the EXIT trap fires. -SLUG="e2e-$(date +%Y%m%d)-${RUN_ID_SUFFIX}" +# Canary runs get a distinct prefix so their safety-net sweeper only +# touches their own runs, not in-flight full runs. +if [ "$MODE" = "canary" ]; then + SLUG="e2e-canary-$(date +%Y%m%d)-${RUN_ID_SUFFIX}" +else + SLUG="e2e-$(date +%Y%m%d)-${RUN_ID_SUFFIX}" +fi SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32) -# ─── logging helpers ──────────────────────────────────────────────────── log() { echo "[$(date +%H:%M:%S)] $*"; } fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; } ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; } @@ -79,9 +70,6 @@ ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; } CURL_COMMON=(-sS --fail-with-body --max-time 30) # ─── cleanup trap ─────────────────────────────────────────────────────── -# Teardown runs on every exit path (success, failure, signal). The -# delete-tenant endpoint is idempotent — calling it on a slug that was -# never created returns 404 which we swallow. CLEANUP_DONE=0 cleanup_org() { [ "$CLEANUP_DONE" = "1" ] && return 0 @@ -93,7 +81,6 @@ cleanup_org() { fi log "🧹 Tearing down org $SLUG..." - # Confirm token must equal slug — defense against accidental teardowns. curl "${CURL_COMMON[@]}" -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ @@ -101,8 +88,6 @@ cleanup_org() { && ok "Teardown request accepted" \ || log "Teardown returned non-2xx (may already be gone)" - # Leak detection: wait briefly then query CP for any remaining artefacts - # tagged with this slug. Anything left = bug in DeprovisionInstance. 
sleep 10 local leak_count leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \ @@ -125,72 +110,71 @@ log " Slug: $SLUG" log " Runtime: $RUNTIME" log " Mode: $MODE" log " Timeout: ${PROVISION_TIMEOUT_SECS}s" +[ "${E2E_INTENTIONAL_FAILURE:-0}" = "1" ] && log " ⚠️ INTENTIONAL_FAILURE=1 — this run MUST fail mid-way; teardown MUST still clean up" log "═══════════════════════════════════════════════════════════════════" -log "0/10 Preflight: CP reachable?" +log "0/11 Preflight: CP reachable?" curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed" ok "CP reachable" -# ─── 1. Accept terms (idempotent) ─────────────────────────────────────── -log "1/10 Accepting current terms..." -curl "${CURL_COMMON[@]}" -X POST "$CP_URL/cp/auth/accept-terms" \ - -H "Cookie: molecule_cp_session=$SESSION_COOKIE" \ - -H "Content-Type: application/json" \ - -d '{}' >/dev/null || log "accept-terms returned non-2xx (may already be accepted)" -ok "Terms acceptance step complete" +admin_call() { + local method="$1"; shift + local path="$1"; shift + curl "${CURL_COMMON[@]}" -X "$method" "$CP_URL$path" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + "$@" +} -# ─── 2. Create org ────────────────────────────────────────────────────── -log "2/10 Creating org $SLUG..." -CREATE_RESP=$(curl "${CURL_COMMON[@]}" -X POST "$CP_URL/cp/orgs" \ - -H "Cookie: molecule_cp_session=$SESSION_COOKIE" \ - -H "Content-Type: application/json" \ - -d "{\"slug\":\"$SLUG\",\"name\":\"E2E $SLUG\"}") +# ─── 1. Create org via admin endpoint ─────────────────────────────────── +log "1/11 Creating org $SLUG via /cp/admin/orgs..." +CREATE_RESP=$(admin_call POST /cp/admin/orgs \ + -d "{\"slug\":\"$SLUG\",\"name\":\"E2E $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}") echo "$CREATE_RESP" | python3 -m json.tool >/dev/null || fail "Org create returned non-JSON: $CREATE_RESP" ok "Org created" -# ─── 3. 
Wait for tenant EC2 + cloudflared tunnel + DNS ────────────────── -log "3/10 Waiting for tenant provisioning (up to ${PROVISION_TIMEOUT_SECS}s)..." +# ─── 2. Wait for tenant provisioning ──────────────────────────────────── +log "2/11 Waiting for tenant provisioning (up to ${PROVISION_TIMEOUT_SECS}s)..." DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS )) LAST_STATUS="" while true; do if [ "$(date +%s)" -gt "$DEADLINE" ]; then fail "Tenant provisioning timed out after ${PROVISION_TIMEOUT_SECS}s (last: $LAST_STATUS)" fi - STATUS_JSON=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/orgs/$SLUG/provision-status" \ - -H "Cookie: molecule_cp_session=$SESSION_COOKIE" 2>/dev/null || echo '{}') - STATUS=$(echo "$STATUS_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "") + LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}') + STATUS=$(echo "$LIST_JSON" | python3 -c " +import json, sys +d = json.load(sys.stdin) +for o in d.get('orgs', []): + if o.get('slug') == '$SLUG': + print(o.get('status', '')) + sys.exit(0) +print('') +" 2>/dev/null || echo "") if [ "$STATUS" != "$LAST_STATUS" ]; then log " status → $STATUS" LAST_STATUS="$STATUS" fi case "$STATUS" in - running) break ;; - failed) fail "Tenant provisioning failed: $(echo "$STATUS_JSON" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("error",""))')" ;; - provisioning|awaiting_payment|pending|"") sleep 15 ;; - *) sleep 15 ;; + running) break ;; + failed) fail "Tenant provisioning failed for $SLUG" ;; + *) sleep 15 ;; esac done ok "Tenant provisioning complete" -TENANT_URL=$(echo "$STATUS_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('url') or '')" 2>/dev/null || echo "") -[ -z "$TENANT_URL" ] && TENANT_URL="https://$SLUG.moleculesai.app" +TENANT_URL="https://$SLUG.moleculesai.app" log " TENANT_URL=$TENANT_URL" -# Auth strategy for tenant calls: session cookie. 
The tenant platform's -# session-auth middleware verifies the cookie against CP via -# /cp/auth/tenant-member; a session that's a member of the org is -# treated as admin on that tenant. Same cookie that authed /cp/orgs -# above, so no separate token plumbing needed -- as long as the test -# user is auto-added as owner of the freshly-created org (which is the -# default behaviour of POST /cp/orgs). -# -# provision-status does not return org_id or admin_token today; both -# were an assumption in an earlier draft. X-Molecule-Org-Id is derived -# server-side from the session membership lookup, so the header is -# unnecessary. +# ─── 3. Retrieve per-tenant admin token ──────────────────────────────── +log "3/11 Fetching per-tenant admin token..." +TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token") +TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null || echo "") +[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token for $SLUG" +ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})" -# ─── 4. Wait for tenant TLS cert to be reachable ─────────────────────── -log "4/10 Waiting for tenant TLS / DNS propagation..." +# ─── 4. Wait for tenant TLS / DNS propagation ────────────────────────── +log "4/11 Waiting for tenant TLS / DNS propagation..." TLS_DEADLINE=$(( $(date +%s) + 180 )) while true; do if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then @@ -203,38 +187,47 @@ while true; do done ok "Tenant reachable at $TENANT_URL" +# Sanity-test path: once the tenant is provisioned, poisoning the +# tenant token proves the EXIT trap + leak assertion still fire. +# Gate AFTER provisioning so the provision path itself stays valid. 
+EFFECTIVE_TENANT_TOKEN="$TENANT_TOKEN" +if [ "${E2E_INTENTIONAL_FAILURE:-0}" = "1" ]; then + log "⚠️ INTENTIONAL_FAILURE: poisoning tenant token for the workspace-provision step" + EFFECTIVE_TENANT_TOKEN="poisoned-$$" +fi + tenant_call() { local method="$1"; shift local path="$1"; shift curl "${CURL_COMMON[@]}" -X "$method" "$TENANT_URL$path" \ - -H "Cookie: molecule_cp_session=$SESSION_COOKIE" \ + -H "Authorization: Bearer $EFFECTIVE_TENANT_TOKEN" \ "$@" } -# ─── 5. Provision workspace (parent) ─────────────────────────────────── -log "5/10 Provisioning parent workspace (runtime=$RUNTIME)..." +# ─── 5. Provision parent workspace ───────────────────────────────────── +log "5/11 Provisioning parent workspace (runtime=$RUNTIME)..." PARENT_RESP=$(tenant_call POST /workspaces \ -H "Content-Type: application/json" \ -d "{\"name\":\"E2E Parent\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"gpt-4o\"}") PARENT_ID=$(echo "$PARENT_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['id'])") log " PARENT_ID=$PARENT_ID" -# ─── 6. Provision child (full mode only — for delegation test) ───────── +# ─── 6. Provision child (full mode only) ──────────────────────────────── CHILD_ID="" if [ "$MODE" = "full" ]; then - log "6/10 Provisioning child workspace..." + log "6/11 Provisioning child workspace..." CHILD_RESP=$(tenant_call POST /workspaces \ -H "Content-Type: application/json" \ -d "{\"name\":\"E2E Child\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"gpt-4o\",\"parent_id\":\"$PARENT_ID\"}") CHILD_ID=$(echo "$CHILD_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['id'])") log " CHILD_ID=$CHILD_ID" else - log "6/10 Canary mode — skipping child workspace (full mode only)" + log "6/11 Canary mode — skipping child workspace" fi # ─── 7. Wait for workspace(s) online ─────────────────────────────────── -log "7/10 Waiting for workspace(s) to reach status=online..." 
-WS_DEADLINE=$(( $(date +%s) + 600 )) # 10 min +log "7/11 Waiting for workspace(s) to reach status=online..." +WS_DEADLINE=$(( $(date +%s) + 600 )) WS_TO_CHECK="$PARENT_ID" [ -n "$CHILD_ID" ] && WS_TO_CHECK="$WS_TO_CHECK $CHILD_ID" for wid in $WS_TO_CHECK; do @@ -254,7 +247,7 @@ for wid in $WS_TO_CHECK; do done # ─── 8. A2A round-trip on parent ─────────────────────────────────────── -log "8/10 Sending A2A message to parent — expecting an agent response..." +log "8/11 Sending A2A message to parent — expecting agent response..." A2A_PAYLOAD=$(python3 -c " import json, uuid print(json.dumps({ @@ -287,9 +280,9 @@ if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then fi ok "A2A parent round-trip succeeded: \"${AGENT_TEXT:0:80}\"" -# ─── 9. HMA memory + peers + activity (full mode only) ──────────────── +# ─── 9. HMA + peers + activity (full mode) ───────────────────────────── if [ "$MODE" = "full" ]; then - log "9/10 Writing + reading HMA memory on parent..." + log "9/11 Writing + reading HMA memory on parent..." 
MEM_PAYLOAD=$(python3 -c " import json print(json.dumps({ @@ -311,10 +304,8 @@ print(json.dumps({ tenant_call GET "/registry/$PARENT_ID/peers" -o /dev/null -w "%{http_code}\n" 2>&1 | head -1 > /tmp/peers_code.txt set -e PEERS_CODE=$(cat /tmp/peers_code.txt) - if [ "$PEERS_CODE" = "404" ]; then - fail "Peers endpoint missing (404) — route regression" - fi - ok "Peers endpoint reachable (HTTP $PEERS_CODE — 401 expected without ws token)" + [ "$PEERS_CODE" = "404" ] && fail "Peers endpoint missing (404) — route regression" + ok "Peers endpoint reachable (HTTP $PEERS_CODE)" ACTIVITY=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" 2>/dev/null || echo '[]') ACTIVITY_COUNT=$(echo "$ACTIVITY" | python3 -c "import json,sys @@ -322,19 +313,12 @@ d=json.load(sys.stdin) print(len(d if isinstance(d, list) else d.get('events', [])))" 2>/dev/null || echo 0) log " Activity events observed: $ACTIVITY_COUNT" else - log "9/10 Canary mode — skipping HMA / peers / activity (full mode only)" + log "9/11 Canary mode — skipping HMA / peers / activity" fi -# ─── 10. Delegation mechanics (full mode + child exists) ────────────── -# Verifies the proxy path that delegate_task uses under the hood: -# parent → /workspaces/$CHILD_ID/a2a (X-Source-Workspace-Id: parent) → -# child runtime → response routes back. Does NOT depend on LLM compliance -# (the parent agent's tool-use behaviour is tested separately via -# canvas-driven prompts). If the proxy mechanics are broken, no amount -# of prompt-engineering on the parent will land a delegation; this -# section pins the mechanics regression. +# ─── 10. 
Delegation mechanics (full mode + child) ────────────────────── if [ "$MODE" = "full" ] && [ -n "$CHILD_ID" ]; then - log "10/11 Delegation mechanics: parent → child via /workspaces/:id/a2a proxy" + log "10/11 Delegation mechanics: parent → child via proxy" DELEG_PAYLOAD=$(python3 -c " import json, uuid print(json.dumps({ @@ -352,15 +336,13 @@ print(json.dumps({ ") set +e DELEG_RESP=$(curl "${CURL_COMMON[@]}" -X POST "$TENANT_URL/workspaces/$CHILD_ID/a2a" \ - -H "Cookie: molecule_cp_session=$SESSION_COOKIE" \ + -H "Authorization: Bearer $EFFECTIVE_TENANT_TOKEN" \ -H "X-Source-Workspace-Id: $PARENT_ID" \ -H "Content-Type: application/json" \ -d "$DELEG_PAYLOAD") DELEG_RC=$? set -e - if [ $DELEG_RC -ne 0 ]; then - fail "Delegation A2A POST failed (rc=$DELEG_RC)" - fi + [ $DELEG_RC -ne 0 ] && fail "Delegation A2A POST failed (rc=$DELEG_RC)" DELEG_TEXT=$(echo "$DELEG_RESP" | python3 -c " import json, sys try: @@ -370,22 +352,17 @@ try: except Exception: print('') " 2>/dev/null || echo "") - if [ -z "$DELEG_TEXT" ]; then - fail "Delegation returned no text. Raw: ${DELEG_RESP:0:200}" - fi + [ -z "$DELEG_TEXT" ] && fail "Delegation returned no text. Raw: ${DELEG_RESP:0:200}" ok "Delegation proxy works (child responded: \"${DELEG_TEXT:0:60}\")" - # Verify activity log on child captured the delegation. The source - # workspace id is logged by the a2a_proxy when X-Source-Workspace-Id - # is present on the inbound request. CHILD_ACT=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" 2>/dev/null || echo '[]') if echo "$CHILD_ACT" | grep -q "$PARENT_ID"; then ok "Child activity log records parent as source" else - log "Child activity log did not reference parent (activity pipeline may be async — soft warning only)" + log "Child activity log did not reference parent (pipeline may be async)" fi fi -# ─── 11. Cleanup runs via trap ──────────────────────────────────────── +# ─── 11. 
Teardown runs via trap ──────────────────────────────────────── log "11/11 All checks passed. Teardown runs via EXIT trap." ok "═══ STAGING $MODE-SAAS E2E PASSED ═══"