Merge branch 'staging' into fix/canvas-multilevel-layout-ux

This commit is contained in:
Hongming Wang 2026-04-26 00:36:54 -07:00 committed by GitHub
commit 8543bae83f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 675 additions and 63 deletions

View File

@ -43,6 +43,17 @@ jobs:
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
# Without an LLM key, test_staging_full_saas.sh provisions the
# workspace with empty secrets; hermes' derive-provider.sh then
# resolves `openai/gpt-4o` to PROVIDER=openrouter, finds no
# OPENROUTER_API_KEY in env, and A2A returns "No LLM provider
# configured" at request time (canary step 8/11). The full-lifecycle
# workflow (e2e-staging-saas.yml) has carried this secret since
# launch — the canary regressed when it was first split out and
# lost the env block. Issue #1500 racked up ~30 consecutive failures
# before this was spotted; do NOT remove without re-reading the
# script's secrets-injection block.
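# A rough sketch of that chain (hypothetical; the authoritative
# logic lives in hermes' derive-provider.sh, not here):
#   case "$MODEL" in
#     openai/*) PROVIDER=openrouter ;;  # openai/* routes via OpenRouter
#   esac
#   : "${OPENROUTER_API_KEY:?No LLM provider configured}"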
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
E2E_MODE: canary
E2E_RUNTIME: hermes
E2E_RUN_ID: "canary-${{ github.run_id }}"
@ -57,6 +68,14 @@ jobs:
exit 2
fi
- name: Verify OpenAI key present
run: |
if [ -z "$E2E_OPENAI_API_KEY" ]; then
echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'"
exit 2
fi
echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})"
- name: Canary run
id: canary
run: bash tests/e2e/test_staging_full_saas.sh

View File

@ -0,0 +1,164 @@
name: redeploy-tenants-on-main
# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
# 1. publish-workspace-server-image completes → new :latest in GHCR.
# 2. This workflow fires via workflow_run, waits 30s for GHCR's
# CDN to propagate the new tag to the region the tenants pull from.
# 3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s
# soak. Canary proves the image boots; batches follow.
# 4. Any failure aborts the rollout and leaves older tenants on the
# prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.
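# Example rollback (illustrative invocation; assumes the gh CLI and
# that this file keeps the name redeploy-tenants-on-main.yml):
#   gh workflow run redeploy-tenants-on-main.yml \
#     -f target_tag=a59f1a6c -f canary_slug=hongmingwang -f dry_run=true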
on:
workflow_run:
workflows: ['publish-workspace-server-image']
types: [completed]
branches: [main]
workflow_dispatch:
inputs:
target_tag:
description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.'
required: false
type: string
default: 'latest'
canary_slug:
description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
required: false
type: string
default: 'hongmingwang'
soak_seconds:
description: 'Seconds to wait after canary before fanning out.'
required: false
type: string
default: '60'
batch_size:
description: 'How many tenants SSM redeploys in parallel per batch.'
required: false
type: string
default: '3'
dry_run:
description: 'Plan only — do not actually redeploy.'
required: false
type: boolean
default: false
permissions:
contents: read
# No write scopes needed — the workflow hits an external CP endpoint,
# not the GitHub API.
jobs:
redeploy:
# Skip the auto-trigger if publish-workspace-server-image didn't
# actually succeed. workflow_run fires on any completion state; we
# don't want to redeploy against a half-built image.
if: |
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
runs-on: ubuntu-latest
timeout-minutes: 25
steps:
- name: Wait for GHCR tag propagation
# GHCR's edge cache takes ~15-30s to consistently serve the new
# :latest manifest after the registry accepts the push. Without
# this sleep, the first tenant's docker pull sometimes races
# and fetches the previous digest; sleeping is the cheapest
# way to reduce that without polling GHCR for the new digest.
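# (A stricter alternative would poll until the tag's digest changes,
# e.g. `docker manifest inspect <image>:latest` in a retry loop, with
# the image path as pushed by publish-workspace-server-image. The
# fixed sleep keeps this workflow dependency-free.)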
run: sleep 30
- name: Call CP redeploy-fleet
# CP_ADMIN_API_TOKEN must be set as a repo/org secret on
# Molecule-AI/molecule-core, matching the staging/prod CP's
# CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
# repo's secrets for CI.
env:
CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }}
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
DRY_RUN: ${{ inputs.dry_run || false }}
run: |
set -euo pipefail
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
exit 1
fi
BODY=$(jq -nc \
--arg tag "$TARGET_TAG" \
--arg canary "$CANARY_SLUG" \
--argjson soak "$SOAK_SECONDS" \
--argjson batch "$BATCH_SIZE" \
--argjson dry "$DRY_RUN" \
'{
target_tag: $tag,
canary_slug: $canary,
soak_seconds: $soak,
batch_size: $batch,
dry_run: $dry
}')
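# With the defaults, BODY renders as:
#   {"target_tag":"latest","canary_slug":"hongmingwang","soak_seconds":60,"batch_size":3,"dry_run":false}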
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " body: $BODY"
HTTP_RESPONSE=$(mktemp)
# Keep the error fallback outside the substitution: curl prints its
# own "000" via -w on connection failure, so an inner `|| echo "000"`
# would leave HTTP_CODE holding two lines instead of one token.
HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY") || HTTP_CODE="000"
echo "HTTP $HTTP_CODE"
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
# Pretty-print per-tenant results in the job summary so
# ops can see which tenants were redeployed without drilling
# into the raw response.
{
echo "## Tenant redeploy fleet"
echo ""
echo "**Target tag:** \`$TARGET_TAG\`"
echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
echo "**Batch size:** $BATCH_SIZE"
echo "**Dry run:** $DRY_RUN"
echo "**HTTP:** $HTTP_CODE"
echo ""
echo "### Per-tenant result"
echo ""
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
echo '|------|-------|------------|------|---------|-------|'
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
} >> "$GITHUB_STEP_SUMMARY"
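# Assumed response shape for the jq above (illustrative; the
# authoritative schema is the redeploy-fleet handler in
# molecule-controlplane):
#   {"ok": true,
#    "results": [{"slug": "hongmingwang", "phase": "canary",
#                 "ssm_status": "Success", "ssm_exit_code": 0,
#                 "healthz_ok": true, "error": null}]}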
if [ "$HTTP_CODE" != "200" ]; then
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
exit 1
fi
OK=$(jq -r '.ok' "$HTTP_RESPONSE")
if [ "$OK" != "true" ]; then
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
exit 1
fi
echo "::notice::Tenant fleet redeploy complete."

View File

@ -0,0 +1,170 @@
name: Sweep stale e2e-* orgs (staging)
# Janitor for staging tenants left behind when E2E cleanup didn't run:
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
# bash trap missed (signal 9), etc. Without this loop, every failed
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
#
# Why not rely on per-test-run teardown:
# - Per-run teardown is best-effort by definition. Any process death
# after the test starts but before the trap fires leaves debris.
# - GH Actions cancellation kills the runner without a grace period.
# The workflow's `if: always()` step usually catches this, but it
# too can fail (CP transient 5xx, runner network issue at the
# wrong moment).
# - Even when teardown runs, the CP cascade is best-effort in places
# (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
# - This sweep is the catch-all that converges staging back to clean
# regardless of which specific path leaked.
#
# The PROPER fix is making CP cleanup transactional + verify-after-
# terminate (filed separately as cleanup-correctness work). This
# workflow is the safety net that catches everything else AND any
# future leak source we haven't yet identified.
on:
schedule:
# Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
# clock from create to teardown). Anything older than the
# MAX_AGE_MINUTES threshold below is presumed dead.
- cron: '0 * * * *'
workflow_dispatch:
inputs:
max_age_minutes:
description: "Delete e2e-* orgs older than N minutes (default 120)"
required: false
default: "120"
dry_run:
description: "Dry run only — list what would be deleted"
required: false
type: boolean
default: false
# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
# on a manual trigger; queue rather than parallel-delete.
concurrency:
group: sweep-stale-e2e-orgs
cancel-in-progress: false
permissions:
contents: read
jobs:
sweep:
name: Sweep e2e orgs
runs-on: ubuntu-latest
timeout-minutes: 15
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
# Refuse to delete more than this many orgs in one tick. If the
# admin endpoint goes weird (bogus created_at values, a stale
# snapshot after a DB hiccup), a large batch of e2e- orgs could
# suddenly look stale at once. Bailing protects against runaway
# nukes; rows with a missing created_at are separately skipped in
# the filter below.
SAFETY_CAP: 50
steps:
- name: Verify admin token present
run: |
if [ -z "$ADMIN_TOKEN" ]; then
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
exit 2
fi
echo "Admin token present ✓"
- name: Identify stale e2e orgs
id: identify
run: |
set -euo pipefail
# Fetch into a file the python step opens from disk — cleaner
# than embedding $(curl ...) into a heredoc.
curl -sS --fail-with-body --max-time 30 \
"$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
> orgs.json
# Filter:
# 1. slug starts with 'e2e-' (covers e2e-, e2e-canary-,
# e2e-canvas-* — all variants the test scripts mint)
# 2. created_at is older than MAX_AGE_MINUTES ago
# Output one slug per line to a file the next step reads.
python3 > stale_slugs.txt <<'PY'
import json, os
from datetime import datetime, timezone, timedelta
with open("orgs.json") as f:
data = json.load(f)
max_age = int(os.environ["MAX_AGE_MINUTES"])
cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
for o in data.get("orgs", []):
slug = o.get("slug", "")
if not slug.startswith("e2e-"):
continue
created = o.get("created_at")
if not created:
# Defensively skip rows without created_at — better
# to leave one orphan than nuke a brand-new row
# whose timestamp didn't render.
continue
# Python 3.11+ handles RFC3339 with Z directly via
# fromisoformat; older runners need the trailing Z swap.
created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
if created_dt < cutoff:
print(slug)
PY
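# Assumed orgs.json shape for the filter above (illustrative):
#   {"orgs":[{"slug":"e2e-canary-1a2b","created_at":"2026-04-25T21:04:11Z", ...}]}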
count=$(wc -l < stale_slugs.txt | tr -d ' ')
echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
if [ "$count" -gt 0 ]; then
echo "First 20:"
head -20 stale_slugs.txt | sed 's/^/ /'
fi
echo "count=$count" >> "$GITHUB_OUTPUT"
- name: Safety gate
if: steps.identify.outputs.count != '0'
run: |
count="${{ steps.identify.outputs.count }}"
if [ "$count" -gt "$SAFETY_CAP" ]; then
echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
exit 1
fi
echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
- name: Delete stale orgs
if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
run: |
set -uo pipefail
deleted=0
failed=0
while IFS= read -r slug; do
[ -z "$slug" ] && continue
# The DELETE handler requires {"confirm": "<slug>"} matching
# the URL slug — fat-finger guard. Idempotent: re-issuing
# picks up via org_purges.last_step.
# Fallback outside the substitution: curl prints its own "000" via
# -w on connection failure, so an inner `|| echo` would double it.
http_code=$(curl -sS -o /tmp/del_resp -w "%{http_code}" \
--max-time 60 \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}") || http_code="000"
if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
deleted=$((deleted+1))
echo " deleted: $slug"
else
failed=$((failed+1))
echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
fi
done < stale_slugs.txt
echo ""
echo "Sweep summary: deleted=$deleted failed=$failed"
# Don't fail the workflow on per-org delete errors — the
# sweeper is best-effort. Next hourly tick re-attempts. We
# only fail loud at the safety-cap gate above.
- name: Dry-run summary
if: env.DRY_RUN == 'true'
run: |
echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s). Re-run with dry_run=false to actually delete."

View File

@ -5,7 +5,7 @@
* the per-tenant admin token, provisions one hermes workspace, waits
* for online, then exports:
*
* STAGING_TENANT_URL https://<slug>.moleculesai.app
* STAGING_TENANT_URL https://<slug>.staging.moleculesai.app
* STAGING_WORKSPACE_ID UUID of the hermes workspace
* STAGING_TENANT_TOKEN per-tenant admin bearer (for spec requests)
* STAGING_SLUG org slug (used by teardown)
@ -16,6 +16,11 @@
* CP_ADMIN_API_TOKEN). Drives provision +
* tenant-token retrieval + teardown via a
* single credential.
* STAGING_TENANT_DOMAIN (default: staging.moleculesai.app): the
* DNS suffix the CP provisioner writes for
* staging tenants. Override only when
* running this harness against a non-default
* zone.
*/
import type { FullConfig } from "@playwright/test";
@ -25,6 +30,14 @@ import { join } from "path";
const CP_URL = process.env.MOLECULE_CP_URL || "https://staging-api.moleculesai.app";
const ADMIN_TOKEN = process.env.MOLECULE_ADMIN_TOKEN;
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
// Tenant DNS zone for staging. CP provisioner registers DNS as
// `<slug>.staging.moleculesai.app` (see internal/provisioner/ec2.go's
// EC2 provisioner: DNS log line). The previous default of plain
// `moleculesai.app` matched prod tenant naming and silently broke
// every staging E2E at the TLS readiness step — DNS literally didn't
// resolve, fetch threw NXDOMAIN, waitFor saw null on every poll, and
// the harness wedged at TLS_TIMEOUT_MS instead of failing loud.
const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.app";
// Tenant cold boot on staging regularly takes 12-15 min when the
// workspace-server Docker image isn't already cached on the AMI. Raised
@ -105,22 +118,44 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
}
console.log(`[staging-setup] Org created: ${slug}`);
// 2. Wait for tenant running (admin-orgs list is the status source)
// 2. Wait for tenant running (admin-orgs list is the status source).
//
// The CP /cp/admin/orgs endpoint returns each org with an
// `instance_status` field (handlers/admin.go:adminOrgSummary,
// sourced from `org_instances.status`). NOT `status` — there's no
// top-level `status` on the row at all. A previous version of this
// test polled `row.status`, which was always undefined, so this
// waitFor never resolved truthy and the harness invariably timed
// out at 1200s — masking real CP bugs (see #242 chain) AND
// surviving real CP fixes alike.
// Capture the org UUID alongside the running check — every request
// we send to the tenant URL after this point needs an
// X-Molecule-Org-Id header (see workspace-server middleware/tenant_guard.go).
// Without it, TenantGuard returns 404 ("must not be inferable by
// probing other orgs' machines"). The CP returns the id on the
// admin-orgs row; capture it here while we're already polling.
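// Assumed admin-orgs row shape (illustrative; the authoritative
// shape is handlers/admin.go:adminOrgSummary, and only the fields
// used below are shown):
//   { id: string; slug: string; instance_status: string /* "running", "failed", ... */ }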
let orgID = "";
await waitFor<boolean>(
async () => {
const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
if (r.status !== 200) return null;
const row = (r.body?.orgs || []).find((o: any) => o.slug === slug);
if (!row) return null;
if (row.status === "running") return true;
if (row.status === "failed") throw new Error(`provision failed: ${slug}`);
if (row.instance_status === "running") {
orgID = row.id;
return true;
}
if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`);
return null;
},
PROVISION_TIMEOUT_MS,
15_000,
"tenant provision",
);
console.log(`[staging-setup] Tenant running`);
if (!orgID) {
throw new Error(`expected admin-orgs row to carry id, got empty for slug=${slug}`);
}
console.log(`[staging-setup] Tenant running (org_id=${orgID})`);
// 3. Fetch per-tenant admin token
const tokRes = await jsonFetch(
@ -133,7 +168,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
);
}
const tenantToken: string = tokRes.body.admin_token;
const tenantURL = `https://${slug}.moleculesai.app`;
const tenantURL = `https://${slug}.${TENANT_DOMAIN}`;
console.log(`[staging-setup] Tenant URL: ${tenantURL}`);
// 4. TLS readiness
@ -154,7 +189,17 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
);
// 5. Provision workspace
const tenantAuth = { Authorization: `Bearer ${tenantToken}` };
//
// tenantAuth carries TWO headers, both required:
// - Authorization: Bearer <admin-token> — wsAdmin middleware gate
// - X-Molecule-Org-Id: <uuid> — TenantGuard cross-org gate
// Missing the org-id header silently 404s every non-allowlisted
// route, with no body and no security headers. The 404 is
// intentional (existence non-inference), which makes it look like
// a missing route.
const tenantAuth = {
"Authorization": `Bearer ${tenantToken}`,
"X-Molecule-Org-Id": orgID,
};
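// Illustrative contrast (responses hypothetical, per the note above):
//   GET ${tenantURL}/workspaces with Authorization only → 404, empty body
//   GET ${tenantURL}/workspaces with both headers       → 200, workspace list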
const ws = await jsonFetch(`${tenantURL}/workspaces`, {
method: "POST",
headers: tenantAuth,

View File

@ -63,6 +63,82 @@ test.describe("staging canvas tabs", () => {
Authorization: `Bearer ${tenantToken}`,
});
// canvas/src/components/AuthGate.tsx fetches /cp/auth/me on mount
// and redirects to the login page on 401. The bearer header above
// is for platform API calls — it does NOT satisfy /cp/auth/me,
// which is cookie-based (WorkOS session). Without this mock, the
// canvas page mounts AuthGate, sees 401 from /cp/auth/me, and
// redirects away from the tenant URL before the React Flow root
// ever renders. The [aria-label] selector wait then times out.
//
// Intercept /cp/auth/me + return a fake Session shape so AuthGate
// resolves to "authenticated" and renders {children}. The session
// contents are cosmetic — the canvas only inspects org_id/user_id
// in a few places that don't fail when these are dummy values.
await context.route("**/cp/auth/me", (route) =>
route.fulfill({
status: 200,
contentType: "application/json",
body: JSON.stringify({
user_id: `e2e-test-user-${workspaceId}`,
org_id: "e2e-test-org",
email: "e2e@test.local",
}),
}),
);
// Universal 401 → empty-200 fallback (defense-in-depth).
//
// The original product bug was canvas/src/lib/api.ts:62-74 calling
// `redirectToLogin` on EVERY 401 — a single workspace-scoped 401
// (e.g. /workspaces/:id/peers, /plugins) yanked the user (and the
// test) to AuthKit. That's now fixed at the source: api.ts probes
// /cp/auth/me before redirecting, so a 401 from a non-auth path
// with a live session throws a regular error instead.
//
// This route handler stays as a SAFETY NET, not the primary
// defense:
// 1. It silences resource-load console noise from the browser
// (those messages don't include the URL, so they're useless in
// diagnostics; the assertion-block filter catches them anyway,
// but letting no 401s reach the network at all is cleaner).
// 2. It guards against panels that DON'T have try/catch around
// their api calls — an unhandled rejection would surface
// as console.error → fail the assertion. Panels SHOULD
// handle errors, but until they're all audited, this is
// the test's belt to api.ts's braces.
//
// Pass-through real responses; swap 401s for 200 + empty body.
// Skip /cp/auth/me (mocked above) and non-fetch resources
// (HTML/JS/CSS bundles that should NOT be intercepted).
await context.route("**", async (route, request) => {
if (request.resourceType() !== "fetch") {
return route.fallback();
}
// /cp/auth/me is mocked above with a fixed Session shape — let
// that handler win without us round-tripping the network.
if (request.url().includes("/cp/auth/me")) {
return route.fallback();
}
let resp;
try {
resp = await route.fetch();
} catch {
return route.fallback();
}
if (resp.status() !== 401) {
return route.fulfill({ response: resp });
}
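// Heuristic: a trailing id-ish segment (hex/uuid) suggests a
// detail route whose caller expects an object; anything else is
// treated as a collection and gets an empty array.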
const lastSeg =
new URL(request.url()).pathname.split("/").filter(Boolean).pop() || "";
const looksLikeList = !/^[0-9a-f-]{8,}$/.test(lastSeg);
await route.fulfill({
status: 200,
contentType: "application/json",
body: looksLikeList ? "[]" : "{}",
});
});
const consoleErrors: string[] = [];
page.on("console", (msg) => {
if (msg.type() === "error") {
@ -70,13 +146,38 @@ test.describe("staging canvas tabs", () => {
}
});
await page.goto(tenantURL, { waitUntil: "networkidle" });
// Capture the URL of any failed network request so a "Failed to load
// resource: 404" console message we filter out below leaves a
// breadcrumb. Browser console messages for resource-load failures
// omit the URL, so we'd otherwise be flying blind. Logged to the
// test's stdout (visible in the workflow log under the failed step).
page.on("requestfailed", (req) => {
console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
});
page.on("response", (res) => {
if (res.status() >= 400) {
console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
}
});
// waitUntil="networkidle" is wrong here — the canvas keeps a
// WebSocket open + polls /events and /workspaces every few
// seconds, so the network is *never* idle for 500ms. page.goto
// would hang until its 45s default timeout. "domcontentloaded"
// returns as soon as the HTML is parsed; React hydration + the
// selector wait below is what actually gates ready-for-interaction.
await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
// Canvas hydration races WebSocket connect + /workspaces fetch.
// Wait for the tablist element (appears after a workspace is
// selected) or the hydration-error banner — whichever wins first.
// Wait for the React Flow canvas wrapper (always present once
// hydrated, even with zero workspaces) or the hydration-error
// banner — whichever wins first. Previous version of this wait
// used `[role="tablist"]`, but that selector only appears AFTER
// a workspace node is clicked (which happens below at L100), so
// the wait would always time out at 45s before any meaningful
// failure surfaced.
await page.waitForSelector(
'[role="tablist"], [data-testid="hydration-error"]',
'[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
{ timeout: 45_000 },
);
@ -106,6 +207,15 @@ test.describe("staging canvas tabs", () => {
for (const tabId of TAB_IDS) {
await test.step(`tab: ${tabId}`, async () => {
const tabButton = page.locator(`#tab-${tabId}`);
// The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
// wrapper) — tabs after position ~3 are clipped behind the
// right-edge fade gradient on smaller viewports. Playwright's
// `toBeVisible()` returns false for clipped elements, so a
// bare visibility check fails on `skills` and later tabs in
// CI. scrollIntoViewIfNeeded brings the button into view
// before the visibility check, mirroring what SidePanel's own
// keyboard handler does on arrow-key navigation.
await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
await expect(
tabButton,
`tab-${tabId} button missing — TABS list may have drifted`,
@ -134,14 +244,22 @@ test.describe("staging canvas tabs", () => {
// Aggregate console-error budget. Known-noisy sources whitelisted:
// Sentry, Vercel analytics, WS reconnects (expected on SaaS
// terminal), favicon 404 (cosmetic).
// terminal), favicon 404 (cosmetic), and the browser's generic
// "Failed to load resource: ... 404" message which never includes
// the URL — uninformative on its own and impossible to filter
// meaningfully without a URL. The page.on('requestfailed') +
// page.on('response>=400') logging above captures the actual URLs
// so a real bug still leaves a breadcrumb in the workflow log;
// a real exception (panel crash, JS error) surfaces as a typed
// error with file path which the filter still catches.
const appErrors = consoleErrors.filter(
(msg) =>
!msg.includes("sentry") &&
!msg.includes("vercel") &&
!msg.includes("WebSocket") &&
!msg.includes("favicon") &&
!msg.includes("molecule-icon.png"), // another cosmetic 404
!msg.includes("molecule-icon.png") && // cosmetic 404
!msg.includes("Failed to load resource"),
);
expect(
appErrors,

View File

@ -74,6 +74,11 @@ export default function Home() {
{hydrationError && (
<div
role="alert"
// Stable testid so the staging E2E (canvas/e2e/staging-tabs.spec.ts)
// can detect this banner without depending on the role="alert"
// selector that's used by other transient toasts. Don't rename
// without updating that spec.
data-testid="hydration-error"
className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-4 z-[9999]"
>
<p className="text-zinc-400 text-sm">{hydrationError}</p>

View File

@ -14,7 +14,7 @@ import { PricingTable } from "@/components/PricingTable";
export const metadata = {
title: "Pricing — Molecule AI",
description:
"Free while you tinker, paid tiers for shipping production multi-agent organizations. Transparent usage-based overage pricing on Pro.",
"Flat-rate team and org pricing — no per-seat fees. Free to start, $29/month for teams, $99/month for production orgs. Full runtime stack included on every paid tier.",
};
export default function PricingPage() {
@ -25,9 +25,12 @@ export default function PricingPage() {
Pricing
</h1>
<p className="mx-auto mt-4 max-w-2xl text-lg text-zinc-300">
Free while you tinker. Pay when you ship real agents to production.
Every tier includes the full runtime stack; you upgrade for scale,
support, and dedicated infrastructure.
One flat price per org, not per seat. Every paid tier includes the
full runtime stack. You upgrade for scale, support, and dedicated
infrastructure.
</p>
<p className="mx-auto mt-2 max-w-xl text-sm text-zinc-400">
5-person team? You pay $29/month, not $200. No seat math, ever.
</p>
</div>
@ -53,7 +56,8 @@ export default function PricingPage() {
.
</p>
<p className="mt-6 text-sm text-zinc-500">
Prices shown in USD. Enterprise / self-hosted licensing available: contact us.
Prices shown in USD. Flat-rate per org, no per-seat fees on any paid tier.
Enterprise / self-hosted licensing available: contact us.
</p>
</section>

View File

@ -50,14 +50,14 @@ describe("PricingTable", () => {
it("renders all three plans with their CTAs", () => {
render(<PricingTable />);
expect(screen.getByRole("heading", { name: "Free" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Starter" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Pro" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Team" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Growth" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Get started" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Starter" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Pro" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Team" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Growth" })).toBeTruthy();
});
it("shows the 'Most popular' badge only on the starter card", () => {
it("shows the 'Most popular' badge only on the Team card", () => {
render(<PricingTable />);
const badges = screen.getAllByText("Most popular");
expect(badges.length).toBe(1);
@ -74,7 +74,7 @@ describe("PricingTable", () => {
it("Paid CTA + anonymous → bounces to signup (no checkout call)", async () => {
mockedFetchSession.mockResolvedValue(null);
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
expect(mockedStartCheckout).not.toHaveBeenCalled();
});
@ -91,7 +91,7 @@ describe("PricingTable", () => {
});
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
await waitFor(() =>
expect(mockedStartCheckout).toHaveBeenCalledWith("pro", "acme"),
@ -111,7 +111,7 @@ describe("PricingTable", () => {
mockedGetTenantSlug.mockReturnValue("");
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => {
const alert = screen.getByRole("alert");
@ -129,7 +129,7 @@ describe("PricingTable", () => {
mockedStartCheckout.mockRejectedValue(new Error("checkout: 500 boom"));
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
await waitFor(() => {
const alert = screen.getByRole("alert");
@ -140,7 +140,7 @@ describe("PricingTable", () => {
it("treats fetchSession network errors as anonymous (fail-closed to signup)", async () => {
mockedFetchSession.mockRejectedValue(new Error("network down"));
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
expect(mockedStartCheckout).not.toHaveBeenCalled();
});
@ -155,7 +155,7 @@ describe("PricingTable", () => {
mockedStartCheckout.mockReturnValue(new Promise(() => {}));
render(<PricingTable />);
const button = screen.getByRole("button", { name: "Upgrade to Pro" });
const button = screen.getByRole("button", { name: "Upgrade to Growth" });
fireEvent.click(button);
await waitFor(() => {

View File

@ -6,32 +6,44 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
// runs happily in node. Splitting keeps the node tests fast.
// ---------------------------------------------------------------------------
// 401 handling — gated on SaaS-tenant hostname
// 401 handling — session-probe-before-redirect
// ---------------------------------------------------------------------------
//
// Before fix/quickstart-bugless, any 401 from any endpoint triggered
// `redirectToLogin()`, navigating to `/cp/auth/login`. That route
// exists only on SaaS (mounted by cp_proxy when CP_UPSTREAM_URL is
// set). On localhost / self-hosted / Vercel preview it 404s, so the
// user lands on a broken login page instead of seeing the actual error.
// History:
// 1. fix/quickstart-bugless: gated redirect on SaaS hostname (slug).
// 2. fix/api-401-probe-before-redirect (this file): probe /cp/auth/me
// before redirecting on a 401 from a non-auth path. The earlier
// behaviour redirected on EVERY 401, so a single 401 from
// /workspaces/:id/plugins (workspace-scoped — refused by the
// tenant admin bearer) yanked the user to AuthKit even when
// the session was fine. The probe lets us tell "session dead"
// from "endpoint refused this token."
//
// These tests lock in:
// - SaaS tenant hostname (*.moleculesai.app) → 401 still redirects.
// - non-SaaS hostname (localhost, LAN IP, apex) → 401 throws, no
// redirect, so the caller renders a real error affordance.
// Matrix:
// slug | path | probe → me | expected
// --- | --- | --- | ---
// acme | /cp/auth/me | (n/a) | redirect (path IS auth)
// acme | /workspaces/... | 401 | redirect (session dead)
// acme | /workspaces/... | 200 | throw, no redirect
// acme | /workspaces/... | network err| throw, no redirect
// "" | /workspaces/... | (n/a) | throw, no redirect (no slug)
const mockFetch = vi.fn();
globalThis.fetch = mockFetch;
function mockFailure(status: number, text: string) {
function mockNextResponse(status: number, text = "") {
mockFetch.mockResolvedValueOnce({
ok: false,
ok: status >= 200 && status < 300,
status,
json: () => Promise.reject(new Error("no json")),
text: () => Promise.resolve(text),
} as unknown as Response);
}
function mockNextNetworkError() {
mockFetch.mockRejectedValueOnce(new Error("network"));
}
function setHostname(host: string) {
Object.defineProperty(window, "location", {
configurable: true,
@ -59,27 +71,66 @@ describe("api 401 handling", () => {
vi.resetModules();
});
it("redirects to login on SaaS tenant hostname", async () => {
it("redirects when /cp/auth/me itself 401s — that IS the session-dead signal", async () => {
setHostname("acme.moleculesai.app");
mockFailure(401, '{"error":"admin auth required"}');
// Single fetch: the /cp/auth/me call itself.
mockNextResponse(401, '{"error":"unauthenticated"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/Session expired/);
await expect(api.get("/cp/auth/me")).rejects.toThrow(/Session expired/);
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
// No probe fired — we already know the session is dead.
expect(mockFetch).toHaveBeenCalledTimes(1);
});
it("redirects when /cp/auth/me probe ALSO 401s — session genuinely dead", async () => {
setHostname("acme.moleculesai.app");
// First call: the workspace-scoped fetch returns 401.
mockNextResponse(401, '{"error":"workspace token required"}');
// Second call: the probe to /cp/auth/me also 401s.
mockNextResponse(401, '{"error":"unauthenticated"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/Session expired/);
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
});
it("does NOT redirect when probe returns 200 — endpoint refused this token, session fine", async () => {
setHostname("acme.moleculesai.app");
// First call: workspace-scoped 401.
mockNextResponse(401, '{"error":"workspace token required"}');
// Second call: probe shows the session is alive.
mockNextResponse(200, '{"user_id":"u1","org_id":"o1","email":"x@y"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
});
it("does NOT redirect when probe network-errors — conservative fallback", async () => {
setHostname("acme.moleculesai.app");
mockNextResponse(401, '{"error":"workspace token required"}');
mockNextNetworkError();
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
});
it("does NOT redirect on localhost — throws a real error instead", async () => {
setHostname("localhost");
mockFailure(401, '{"error":"admin auth required"}');
mockNextResponse(401, '{"error":"admin auth required"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
// No slug → no probe fires either.
expect(mockFetch).toHaveBeenCalledTimes(1);
});
it("does NOT redirect on a LAN hostname", async () => {
setHostname("192.168.1.74");
mockFailure(401, '{"error":"missing workspace auth token"}');
mockNextResponse(401, '{"error":"missing workspace auth token"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/activity")).rejects.toThrow(/401/);
@ -91,7 +142,7 @@ describe("api 401 handling", () => {
// Users landing on app.moleculesai.app (pre-tenant-selection) must
// see the real 401 error rather than loop on login.
setHostname("app.moleculesai.app");
mockFailure(401, '{"error":"admin auth required"}');
mockNextResponse(401, '{"error":"admin auth required"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/401/);

View File

@ -60,15 +60,45 @@ async function request<T>(
return request<T>(method, path, body, retryCount + 1, options);
}
if (res.status === 401) {
// Session expired or credentials lost. On SaaS (tenant subdomain)
// the login page lives at /cp/auth/login and is mounted by the
// control-plane reverse proxy — redirect. On self-hosted / local
// dev / Vercel preview there IS no /cp/* mount, so redirecting
// would navigate to a 404 ("404 page not found") instead of the
// real error the user should see. In that case, throw instead
// and let the caller render a meaningful failure (retry button,
// error banner, etc.).
if (slug) {
// Distinguish "session is dead" from "this endpoint refused this
// token." Old behaviour blanket-redirected on every 401, so a
// single transient 401 from a workspace-scoped endpoint
// (/workspaces/:id/peers, /plugins, etc. that need a workspace
// token rather than the tenant admin bearer) yanked the user
// back to AuthKit even when their session was perfectly fine.
// That broke the staging-tabs E2E for the entire 2026-04-25
// night; #2073/#2074 worked around the symptom in the test by
// mocking 401→200 for every fetch, but the user-facing bug
// stayed.
//
// The canonical "session is dead" signal is /cp/auth/me
// returning 401. For any 401 on a non-auth path, probe
// /cp/auth/me before deciding to redirect:
// - probe 401 → session is actually dead → redirect
// - probe 200 → session is fine, the endpoint just refused
// our specific token → throw a real error,
// caller renders an error state
// - probe network error → assume session-fine (conservative;
// better to throw than to redirect on a
// transient probe failure)
//
// Self-hosted / localhost / reserved subdomains still throw
// without redirecting (slug is empty in those cases) — same
// policy as before.
const isAuthPath = path.startsWith("/cp/auth/");
let sessionDead = isAuthPath;
if (!isAuthPath && slug) {
try {
const probe = await fetch(`${PLATFORM_URL}/cp/auth/me`, {
credentials: "include",
signal: AbortSignal.timeout(5000),
});
sessionDead = probe.status === 401;
} catch {
// Probe failed (network/timeout) — fall through to throw.
}
}
if (sessionDead && slug) {
const { redirectToLogin } = await import("./auth");
redirectToLogin("sign-in");
throw new Error("Session expired — redirecting to login");

View File

@ -32,6 +32,10 @@ export interface Plan {
// plans is the canonical order shown on the pricing page: free → starter
// → pro. Change the order here + the rendered columns follow. Keeping
// this as a module-level const so tests can assert against a known list.
//
// Flat-rate positioning (Issue #1833): "starter" and "pro" are flat-rate
// per-org, not per-seat. This is a deliberate wedge against Cursor/Windsurf
// ($40/seat): even a single seat is ~28% cheaper ($29 vs $40), and a
// 5-engineer team is ~85% cheaper ($29 vs 5 × $40 = $200).
export const plans: Plan[] = [
{
id: "free",
@ -48,8 +52,8 @@ export const plans: Plan[] = [
},
{
id: "starter",
name: "Starter",
tagline: "For small teams shipping real agents",
name: "Team",
tagline: "Flat-rate for teams — one price, no per-seat fees",
price: "$29/month",
features: [
"10 workspaces",
@ -57,14 +61,15 @@ export const plans: Plan[] = [
"Private Upstash Redis namespace",
"Email support (48h)",
"5M LLM tokens / month included",
"No per-seat pricing",
],
ctaLabel: "Upgrade to Starter",
ctaLabel: "Upgrade to Team",
highlighted: true,
},
{
id: "pro",
name: "Pro",
tagline: "For production multi-agent orgs",
name: "Growth",
tagline: "Flat-rate for production multi-agent orgs",
price: "$99/month",
features: [
"Unlimited workspaces",
@ -72,9 +77,10 @@ export const plans: Plan[] = [
"Cross-workspace A2A audit log",
"Priority support (24h)",
"25M LLM tokens / month included",
"No per-seat pricing",
"Usage-based overage billing",
],
ctaLabel: "Upgrade to Pro",
ctaLabel: "Upgrade to Growth",
},
];

View File

@ -32,7 +32,7 @@
set -euo pipefail
DRY_RUN=1
MAX_DELETE_PCT=50 # refuse to delete more than half the records in one run
MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}" # refuse to delete more than this pct of records in one run; caller can override via env
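# Example one-off override (invocation illustrative; substitute the
# real script path):
#   MAX_DELETE_PCT=80 AWS_DEFAULT_REGION=us-west-2 bash <this script>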
REGION="${AWS_DEFAULT_REGION:-us-east-2}"
for arg in "$@"; do