Merge branch 'staging' into fix/canvas-multilevel-layout-ux

This commit is contained in:
Hongming Wang 2026-04-26 00:36:54 -07:00 committed by GitHub
commit 8543bae83f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 675 additions and 63 deletions

View File

@ -43,6 +43,17 @@ jobs:
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
# Without an LLM key, test_staging_full_saas.sh provisions the
# workspace with empty secrets; hermes' derive-provider.sh then
# resolves `openai/gpt-4o` to PROVIDER=openrouter, finds no
# OPENROUTER_API_KEY in env, and A2A returns "No LLM provider
# configured" at request time (canary step 8/11). The full-lifecycle
# workflow (e2e-staging-saas.yml) has carried this secret since
# launch — the canary regressed when it was first split out and
# lost the env block. Issue #1500 racked up ~30 consecutive failures
# before this was spotted; do NOT remove without re-reading the
# script's secrets-injection block.
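# A rough sketch of that chain (hypothetical; the authoritative
# logic lives in hermes' derive-provider.sh, not here):
#   case "$MODEL" in
#     openai/*) PROVIDER=openrouter ;;  # openai/* routes via OpenRouter
#   esac
#   : "${OPENROUTER_API_KEY:?No LLM provider configured}"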
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
E2E_MODE: canary
E2E_RUNTIME: hermes
E2E_RUN_ID: "canary-${{ github.run_id }}"
@ -57,6 +68,14 @@ jobs:
exit 2
fi
- name: Verify OpenAI key present
run: |
if [ -z "$E2E_OPENAI_API_KEY" ]; then
echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'"
exit 2
fi
echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})"
- name: Canary run
id: canary
run: bash tests/e2e/test_staging_full_saas.sh

View File

@ -0,0 +1,164 @@
name: redeploy-tenants-on-main
# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
# 1. publish-workspace-server-image completes → new :latest in GHCR.
# 2. This workflow fires via workflow_run, waits 30s for GHCR's
# CDN to propagate the new tag to the region the tenants pull from.
# 3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s
# soak. Canary proves the image boots; batches follow.
# 4. Any failure aborts the rollout and leaves older tenants on the
# prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.
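# Example rollback (illustrative invocation; assumes the gh CLI and
# that this file keeps the name redeploy-tenants-on-main.yml):
#   gh workflow run redeploy-tenants-on-main.yml \
#     -f target_tag=a59f1a6c -f canary_slug=hongmingwang -f dry_run=true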
on:
workflow_run:
workflows: ['publish-workspace-server-image']
types: [completed]
branches: [main]
workflow_dispatch:
inputs:
target_tag:
description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.'
required: false
type: string
default: 'latest'
canary_slug:
description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
required: false
type: string
default: 'hongmingwang'
soak_seconds:
description: 'Seconds to wait after canary before fanning out.'
required: false
type: string
default: '60'
batch_size:
description: 'How many tenants SSM redeploys in parallel per batch.'
required: false
type: string
default: '3'
dry_run:
description: 'Plan only — do not actually redeploy.'
required: false
type: boolean
default: false
permissions:
contents: read
# No write scopes needed — the workflow hits an external CP endpoint,
# not the GitHub API.
jobs:
redeploy:
# Skip the auto-trigger if publish-workspace-server-image didn't
# actually succeed. workflow_run fires on any completion state; we
# don't want to redeploy against a half-built image.
if: |
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
runs-on: ubuntu-latest
timeout-minutes: 25
steps:
- name: Wait for GHCR tag propagation
# GHCR's edge cache takes ~15-30s to consistently serve the new
# :latest manifest after the registry accepts the push. Without
# this sleep, the first tenant's docker pull sometimes races
# and fetches the previous digest; sleeping is the cheapest
# way to reduce that without polling GHCR for the new digest.
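# (A stricter alternative would poll until the tag's digest changes,
# e.g. `docker manifest inspect <image>:latest` in a retry loop, with
# the image path as pushed by publish-workspace-server-image. The
# fixed sleep keeps this workflow dependency-free.)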
run: sleep 30
- name: Call CP redeploy-fleet
# CP_ADMIN_API_TOKEN must be set as a repo/org secret on
# Molecule-AI/molecule-core, matching the staging/prod CP's
# CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
# repo's secrets for CI.
env:
CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }}
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
DRY_RUN: ${{ inputs.dry_run || false }}
run: |
set -euo pipefail
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
exit 1
fi
BODY=$(jq -nc \
--arg tag "$TARGET_TAG" \
--arg canary "$CANARY_SLUG" \
--argjson soak "$SOAK_SECONDS" \
--argjson batch "$BATCH_SIZE" \
--argjson dry "$DRY_RUN" \
'{
target_tag: $tag,
canary_slug: $canary,
soak_seconds: $soak,
batch_size: $batch,
dry_run: $dry
}')
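# With the defaults, BODY renders as:
#   {"target_tag":"latest","canary_slug":"hongmingwang","soak_seconds":60,"batch_size":3,"dry_run":false}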
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " body: $BODY"
HTTP_RESPONSE=$(mktemp)
# Keep the error fallback outside the substitution: curl prints its
# own "000" via -w on connection failure, so an inner `|| echo "000"`
# would leave HTTP_CODE holding two lines instead of one token.
HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY") || HTTP_CODE="000"
echo "HTTP $HTTP_CODE"
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
# Pretty-print per-tenant results in the job summary so
# ops can see which tenants were redeployed without drilling
# into the raw response.
{
echo "## Tenant redeploy fleet"
echo ""
echo "**Target tag:** \`$TARGET_TAG\`"
echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
echo "**Batch size:** $BATCH_SIZE"
echo "**Dry run:** $DRY_RUN"
echo "**HTTP:** $HTTP_CODE"
echo ""
echo "### Per-tenant result"
echo ""
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
echo '|------|-------|------------|------|---------|-------|'
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
} >> "$GITHUB_STEP_SUMMARY"
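# Assumed response shape for the jq above (illustrative; the
# authoritative schema is the redeploy-fleet handler in
# molecule-controlplane):
#   {"ok": true,
#    "results": [{"slug": "hongmingwang", "phase": "canary",
#                 "ssm_status": "Success", "ssm_exit_code": 0,
#                 "healthz_ok": true, "error": null}]}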
if [ "$HTTP_CODE" != "200" ]; then
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
exit 1
fi
OK=$(jq -r '.ok' "$HTTP_RESPONSE")
if [ "$OK" != "true" ]; then
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
exit 1
fi
echo "::notice::Tenant fleet redeploy complete."

View File

@ -0,0 +1,170 @@
name: Sweep stale e2e-* orgs (staging)
# Janitor for staging tenants left behind when E2E cleanup didn't run:
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
# bash trap missed (signal 9), etc. Without this loop, every failed
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
#
# Why not rely on per-test-run teardown:
# - Per-run teardown is best-effort by definition. Any process death
# after the test starts but before the trap fires leaves debris.
# - GH Actions cancellation kills the runner without a grace period.
# The workflow's `if: always()` step usually catches this, but it
# too can fail (CP transient 5xx, runner network issue at the
# wrong moment).
# - Even when teardown runs, the CP cascade is best-effort in places
# (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
# - This sweep is the catch-all that converges staging back to clean
# regardless of which specific path leaked.
#
# The PROPER fix is making CP cleanup transactional + verify-after-
# terminate (filed separately as cleanup-correctness work). This
# workflow is the safety net that catches everything else AND any
# future leak source we haven't yet identified.
on:
schedule:
# Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
# clock from create to teardown). Anything older than the
# MAX_AGE_MINUTES threshold below is presumed dead.
- cron: '0 * * * *'
workflow_dispatch:
inputs:
max_age_minutes:
description: "Delete e2e-* orgs older than N minutes (default 120)"
required: false
default: "120"
dry_run:
description: "Dry run only — list what would be deleted"
required: false
type: boolean
default: false
# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
# on a manual trigger; queue rather than parallel-delete.
concurrency:
group: sweep-stale-e2e-orgs
cancel-in-progress: false
permissions:
contents: read
jobs:
sweep:
name: Sweep e2e orgs
runs-on: ubuntu-latest
timeout-minutes: 15
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
# Refuse to delete more than this many orgs in one tick. If the
# admin endpoint goes weird (bogus created_at values, a stale
# snapshot after a DB hiccup), a large batch of e2e- orgs could
# suddenly look stale at once. Bailing protects against runaway
# nukes; rows with a missing created_at are separately skipped in
# the filter below.
SAFETY_CAP: 50
steps:
- name: Verify admin token present
run: |
if [ -z "$ADMIN_TOKEN" ]; then
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
exit 2
fi
echo "Admin token present ✓"
- name: Identify stale e2e orgs
id: identify
run: |
set -euo pipefail
# Fetch into a file the python step opens from disk — cleaner
# than embedding $(curl ...) into a heredoc.
curl -sS --fail-with-body --max-time 30 \
"$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
> orgs.json
# Filter:
# 1. slug starts with 'e2e-' (covers e2e-, e2e-canary-,
# e2e-canvas-* — all variants the test scripts mint)
# 2. created_at is older than MAX_AGE_MINUTES ago
# Output one slug per line to a file the next step reads.
python3 > stale_slugs.txt <<'PY'
import json, os
from datetime import datetime, timezone, timedelta
with open("orgs.json") as f:
data = json.load(f)
max_age = int(os.environ["MAX_AGE_MINUTES"])
cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
for o in data.get("orgs", []):
slug = o.get("slug", "")
if not slug.startswith("e2e-"):
continue
created = o.get("created_at")
if not created:
# Defensively skip rows without created_at — better
# to leave one orphan than nuke a brand-new row
# whose timestamp didn't render.
continue
# Python 3.11+ handles RFC3339 with Z directly via
# fromisoformat; older runners need the trailing Z swap.
created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
if created_dt < cutoff:
print(slug)
PY
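# Assumed orgs.json shape for the filter above (illustrative):
#   {"orgs":[{"slug":"e2e-canary-1a2b","created_at":"2026-04-25T21:04:11Z", ...}]}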
count=$(wc -l < stale_slugs.txt | tr -d ' ')
echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
if [ "$count" -gt 0 ]; then
echo "First 20:"
head -20 stale_slugs.txt | sed 's/^/ /'
fi
echo "count=$count" >> "$GITHUB_OUTPUT"
- name: Safety gate
if: steps.identify.outputs.count != '0'
run: |
count="${{ steps.identify.outputs.count }}"
if [ "$count" -gt "$SAFETY_CAP" ]; then
echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
exit 1
fi
echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
- name: Delete stale orgs
if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
run: |
set -uo pipefail
deleted=0
failed=0
while IFS= read -r slug; do
[ -z "$slug" ] && continue
# The DELETE handler requires {"confirm": "<slug>"} matching
# the URL slug — fat-finger guard. Idempotent: re-issuing
# picks up via org_purges.last_step.
# Fallback outside the substitution: curl prints its own "000" via
# -w on connection failure, so an inner `|| echo` would double it.
http_code=$(curl -sS -o /tmp/del_resp -w "%{http_code}" \
--max-time 60 \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}") || http_code="000"
if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
deleted=$((deleted+1))
echo " deleted: $slug"
else
failed=$((failed+1))
echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
fi
done < stale_slugs.txt
echo ""
echo "Sweep summary: deleted=$deleted failed=$failed"
# Don't fail the workflow on per-org delete errors — the
# sweeper is best-effort. Next hourly tick re-attempts. We
# only fail loud at the safety-cap gate above.
- name: Dry-run summary
if: env.DRY_RUN == 'true'
run: |
echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s). Re-run with dry_run=false to actually delete."

View File

@ -5,7 +5,7 @@
* the per-tenant admin token, provisions one hermes workspace, waits
* for online, then exports:
*
* STAGING_TENANT_URL https://<slug>.moleculesai.app
* STAGING_TENANT_URL https://<slug>.staging.moleculesai.app
* STAGING_WORKSPACE_ID UUID of the hermes workspace
* STAGING_TENANT_TOKEN per-tenant admin bearer (for spec requests)
* STAGING_SLUG org slug (used by teardown)
@ -16,6 +16,11 @@
* CP_ADMIN_API_TOKEN). Drives provision +
* tenant-token retrieval + teardown via a
* single credential.
* STAGING_TENANT_DOMAIN (default: staging.moleculesai.app): the
* DNS suffix the CP provisioner writes for
* staging tenants. Override only when
* running this harness against a non-default
* zone.
*/
import type { FullConfig } from "@playwright/test";
@ -25,6 +30,14 @@ import { join } from "path";
const CP_URL = process.env.MOLECULE_CP_URL || "https://staging-api.moleculesai.app";
const ADMIN_TOKEN = process.env.MOLECULE_ADMIN_TOKEN;
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
// Tenant DNS zone for staging. CP provisioner registers DNS as
// `<slug>.staging.moleculesai.app` (see internal/provisioner/ec2.go's
// EC2 provisioner: DNS log line). The previous default of plain
// `moleculesai.app` matched prod tenant naming and silently broke
// every staging E2E at the TLS readiness step — DNS literally didn't
// resolve, fetch threw NXDOMAIN, waitFor saw null on every poll, and
// the harness wedged at TLS_TIMEOUT_MS instead of failing loud.
const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.app";
// Tenant cold boot on staging regularly takes 12-15 min when the
// workspace-server Docker image isn't already cached on the AMI. Raised
@ -105,22 +118,44 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
}
console.log(`[staging-setup] Org created: ${slug}`);
// 2. Wait for tenant running (admin-orgs list is the status source)
// 2. Wait for tenant running (admin-orgs list is the status source).
//
// The CP /cp/admin/orgs endpoint returns each org with an
// `instance_status` field (handlers/admin.go:adminOrgSummary,
// sourced from `org_instances.status`). NOT `status` — there's no
// top-level `status` on the row at all. A previous version of this
// test polled `row.status`, which was always undefined, so this
// waitFor never resolved truthy and the harness invariably timed
// out at 1200s — masking real CP bugs (see #242 chain) AND
// surviving real CP fixes alike.
// Capture the org UUID alongside the running check — every request
// we send to the tenant URL after this point needs an
// X-Molecule-Org-Id header (see workspace-server middleware/tenant_guard.go).
// Without it, TenantGuard returns 404 ("must not be inferable by
// probing other orgs' machines"). The CP returns the id on the
// admin-orgs row; capture it here while we're already polling.
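// Assumed admin-orgs row shape (illustrative; the authoritative
// shape is handlers/admin.go:adminOrgSummary, and only the fields
// used below are shown):
//   { id: string; slug: string; instance_status: string /* "running", "failed", ... */ }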
let orgID = "";
await waitFor<boolean>(
async () => {
const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
if (r.status !== 200) return null;
const row = (r.body?.orgs || []).find((o: any) => o.slug === slug);
if (!row) return null;
if (row.status === "running") return true;
if (row.status === "failed") throw new Error(`provision failed: ${slug}`);
if (row.instance_status === "running") {
orgID = row.id;
return true;
}
if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`);
return null;
},
PROVISION_TIMEOUT_MS,
15_000,
"tenant provision",
);
console.log(`[staging-setup] Tenant running`);
if (!orgID) {
throw new Error(`expected admin-orgs row to carry id, got empty for slug=${slug}`);
}
console.log(`[staging-setup] Tenant running (org_id=${orgID})`);
// 3. Fetch per-tenant admin token
const tokRes = await jsonFetch(
@ -133,7 +168,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
);
}
const tenantToken: string = tokRes.body.admin_token;
const tenantURL = `https://${slug}.moleculesai.app`;
const tenantURL = `https://${slug}.${TENANT_DOMAIN}`;
console.log(`[staging-setup] Tenant URL: ${tenantURL}`);
// 4. TLS readiness
@ -154,7 +189,17 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
);
// 5. Provision workspace
const tenantAuth = { Authorization: `Bearer ${tenantToken}` };
//
// tenantAuth carries TWO headers, both required:
// - Authorization: Bearer <admin-token> — wsAdmin middleware gate
// - X-Molecule-Org-Id: <uuid> — TenantGuard cross-org gate
// Missing the org-id header silently 404s every non-allowlisted
// route, with no body and no security headers. The 404 is
// intentional (existence non-inference), which makes it look like
// a missing route.
const tenantAuth = {
"Authorization": `Bearer ${tenantToken}`,
"X-Molecule-Org-Id": orgID,
};
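// Illustrative contrast (responses hypothetical, per the note above):
//   GET ${tenantURL}/workspaces with Authorization only → 404, empty body
//   GET ${tenantURL}/workspaces with both headers       → 200, workspace list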
const ws = await jsonFetch(`${tenantURL}/workspaces`, {
method: "POST",
headers: tenantAuth,

View File

@ -63,6 +63,82 @@ test.describe("staging canvas tabs", () => {
Authorization: `Bearer ${tenantToken}`,
});
// canvas/src/components/AuthGate.tsx fetches /cp/auth/me on mount
// and redirects to the login page on 401. The bearer header above
// is for platform API calls — it does NOT satisfy /cp/auth/me,
// which is cookie-based (WorkOS session). Without this mock, the
// canvas page mounts AuthGate, sees 401 from /cp/auth/me, and
// redirects away from the tenant URL before the React Flow root
// ever renders. The [aria-label] selector wait then times out.
//
// Intercept /cp/auth/me + return a fake Session shape so AuthGate
// resolves to "authenticated" and renders {children}. The session
// contents are cosmetic — the canvas only inspects org_id/user_id
// in a few places that don't fail when these are dummy values.
await context.route("**/cp/auth/me", (route) =>
route.fulfill({
status: 200,
contentType: "application/json",
body: JSON.stringify({
user_id: `e2e-test-user-${workspaceId}`,
org_id: "e2e-test-org",
email: "e2e@test.local",
}),
}),
);
// Universal 401 → empty-200 fallback (defense-in-depth).
//
// The original product bug was canvas/src/lib/api.ts:62-74 calling
// `redirectToLogin` on EVERY 401 — a single workspace-scoped 401
// (e.g. /workspaces/:id/peers, /plugins) yanked the user (and the
// test) to AuthKit. That's now fixed at the source: api.ts probes
// /cp/auth/me before redirecting, so a 401 from a non-auth path
// with a live session throws a regular error instead.
//
// This route handler stays as a SAFETY NET, not the primary
// defense:
// 1. It silences resource-load console noise from the browser
// (those messages don't include the URL, so they're useless in
// diagnostics; the assertion-block filter catches them anyway,
// but letting no 401s reach the network at all is cleaner).
// 2. It guards against panels that DON'T have try/catch around
// their api calls — an unhandled rejection would surface
// as console.error → fail the assertion. Panels SHOULD
// handle errors, but until they're all audited, this is
// the test's belt to api.ts's braces.
//
// Pass-through real responses; swap 401s for 200 + empty body.
// Skip /cp/auth/me (mocked above) and non-fetch resources
// (HTML/JS/CSS bundles that should NOT be intercepted).
await context.route("**", async (route, request) => {
if (request.resourceType() !== "fetch") {
return route.fallback();
}
// /cp/auth/me is mocked above with a fixed Session shape — let
// that handler win without us round-tripping the network.
if (request.url().includes("/cp/auth/me")) {
return route.fallback();
}
let resp;
try {
resp = await route.fetch();
} catch {
return route.fallback();
}
if (resp.status() !== 401) {
return route.fulfill({ response: resp });
}
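// Heuristic: a trailing id-ish segment (hex/uuid) suggests a
// detail route whose caller expects an object; anything else is
// treated as a collection and gets an empty array.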
const lastSeg =
new URL(request.url()).pathname.split("/").filter(Boolean).pop() || "";
const looksLikeList = !/^[0-9a-f-]{8,}$/.test(lastSeg);
await route.fulfill({
status: 200,
contentType: "application/json",
body: looksLikeList ? "[]" : "{}",
});
});
const consoleErrors: string[] = [];
page.on("console", (msg) => {
if (msg.type() === "error") {
@ -70,13 +146,38 @@ test.describe("staging canvas tabs", () => {
}
});
await page.goto(tenantURL, { waitUntil: "networkidle" });
// Capture the URL of any failed network request so a "Failed to load
// resource: 404" console message we filter out below leaves a
// breadcrumb. Browser console messages for resource-load failures
// omit the URL, so we'd otherwise be flying blind. Logged to the
// test's stdout (visible in the workflow log under the failed step).
page.on("requestfailed", (req) => {
console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
});
page.on("response", (res) => {
if (res.status() >= 400) {
console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
}
});
// waitUntil="networkidle" is wrong here — the canvas keeps a
// WebSocket open + polls /events and /workspaces every few
// seconds, so the network is *never* idle for 500ms. page.goto
// would hang until its 45s default timeout. "domcontentloaded"
// returns as soon as the HTML is parsed; React hydration + the
// selector wait below is what actually gates ready-for-interaction.
await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
// Canvas hydration races WebSocket connect + /workspaces fetch.
// Wait for the tablist element (appears after a workspace is
// selected) or the hydration-error banner — whichever wins first.
// Wait for the React Flow canvas wrapper (always present once
// hydrated, even with zero workspaces) or the hydration-error
// banner — whichever wins first. Previous version of this wait
// used `[role="tablist"]`, but that selector only appears AFTER
// a workspace node is clicked (which happens below at L100), so
// the wait would always time out at 45s before any meaningful
// failure surfaced.
await page.waitForSelector(
'[role="tablist"], [data-testid="hydration-error"]',
'[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
{ timeout: 45_000 },
);
@ -106,6 +207,15 @@ test.describe("staging canvas tabs", () => {
for (const tabId of TAB_IDS) {
await test.step(`tab: ${tabId}`, async () => {
const tabButton = page.locator(`#tab-${tabId}`);
// The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
// wrapper) — tabs after position ~3 are clipped behind the
// right-edge fade gradient on smaller viewports. Playwright's
// `toBeVisible()` returns false for clipped elements, so a
// bare visibility check fails on `skills` and later tabs in
// CI. scrollIntoViewIfNeeded brings the button into view
// before the visibility check, mirroring what SidePanel's own
// keyboard handler does on arrow-key navigation.
await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
await expect(
tabButton,
`tab-${tabId} button missing — TABS list may have drifted`,
@ -134,14 +244,22 @@ test.describe("staging canvas tabs", () => {
// Aggregate console-error budget. Known-noisy sources whitelisted:
// Sentry, Vercel analytics, WS reconnects (expected on SaaS
// terminal), favicon 404 (cosmetic).
// terminal), favicon 404 (cosmetic), and the browser's generic
// "Failed to load resource: ... 404" message which never includes
// the URL — uninformative on its own and impossible to filter
// meaningfully without a URL. The page.on('requestfailed') +
// page.on('response>=400') logging above captures the actual URLs
// so a real bug still leaves a breadcrumb in the workflow log;
// a real exception (panel crash, JS error) surfaces as a typed
// error with file path which the filter still catches.
const appErrors = consoleErrors.filter(
(msg) =>
!msg.includes("sentry") &&
!msg.includes("vercel") &&
!msg.includes("WebSocket") &&
!msg.includes("favicon") &&
!msg.includes("molecule-icon.png"), // another cosmetic 404
!msg.includes("molecule-icon.png") && // cosmetic 404
!msg.includes("Failed to load resource"),
);
expect(
appErrors,

View File

@ -74,6 +74,11 @@ export default function Home() {
{hydrationError && (
<div
role="alert"
// Stable testid so the staging E2E (canvas/e2e/staging-tabs.spec.ts)
// can detect this banner without depending on the role="alert"
// selector that's used by other transient toasts. Don't rename
// without updating that spec.
data-testid="hydration-error"
className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-4 z-[9999]"
>
<p className="text-zinc-400 text-sm">{hydrationError}</p>

View File

@ -14,7 +14,7 @@ import { PricingTable } from "@/components/PricingTable";
export const metadata = {
title: "Pricing — Molecule AI",
description:
"Free while you tinker, paid tiers for shipping production multi-agent organizations. Transparent usage-based overage pricing on Pro.",
"Flat-rate team and org pricing — no per-seat fees. Free to start, $29/month for teams, $99/month for production orgs. Full runtime stack included on every paid tier.",
};
export default function PricingPage() {
@ -25,9 +25,12 @@ export default function PricingPage() {
Pricing
</h1>
<p className="mx-auto mt-4 max-w-2xl text-lg text-zinc-300">
Free while you tinker. Pay when you ship real agents to production.
Every tier includes the full runtime stack; you upgrade for scale,
support, and dedicated infrastructure.
One flat price per org, not per seat. Every paid tier includes the
full runtime stack. You upgrade for scale, support, and dedicated
infrastructure.
</p>
<p className="mx-auto mt-2 max-w-xl text-sm text-zinc-400">
5-person team? You pay $29/month, not $200. No seat math, ever.
</p>
</div>
@ -53,7 +56,8 @@ export default function PricingPage() {
.
</p>
<p className="mt-6 text-sm text-zinc-500">
Prices shown in USD. Enterprise / self-hosted licensing available: contact us.
Prices shown in USD. Flat-rate per org, no per-seat fees on any paid tier.
Enterprise / self-hosted licensing available: contact us.
</p>
</section>

View File

@ -50,14 +50,14 @@ describe("PricingTable", () => {
it("renders all three plans with their CTAs", () => {
render(<PricingTable />);
expect(screen.getByRole("heading", { name: "Free" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Starter" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Pro" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Team" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Growth" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Get started" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Starter" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Pro" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Team" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Growth" })).toBeTruthy();
});
it("shows the 'Most popular' badge only on the starter card", () => {
it("shows the 'Most popular' badge only on the Team card", () => {
render(<PricingTable />);
const badges = screen.getAllByText("Most popular");
expect(badges.length).toBe(1);
@ -74,7 +74,7 @@ describe("PricingTable", () => {
it("Paid CTA + anonymous → bounces to signup (no checkout call)", async () => {
mockedFetchSession.mockResolvedValue(null);
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
expect(mockedStartCheckout).not.toHaveBeenCalled();
});
@ -91,7 +91,7 @@ describe("PricingTable", () => {
});
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
await waitFor(() =>
expect(mockedStartCheckout).toHaveBeenCalledWith("pro", "acme"),
@ -111,7 +111,7 @@ describe("PricingTable", () => {
mockedGetTenantSlug.mockReturnValue("");
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => {
const alert = screen.getByRole("alert");
@ -129,7 +129,7 @@ describe("PricingTable", () => {
mockedStartCheckout.mockRejectedValue(new Error("checkout: 500 boom"));
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
await waitFor(() => {
const alert = screen.getByRole("alert");
@ -140,7 +140,7 @@ describe("PricingTable", () => {
it("treats fetchSession network errors as anonymous (fail-closed to signup)", async () => {
mockedFetchSession.mockRejectedValue(new Error("network down"));
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
expect(mockedStartCheckout).not.toHaveBeenCalled();
});
@ -155,7 +155,7 @@ describe("PricingTable", () => {
mockedStartCheckout.mockReturnValue(new Promise(() => {}));
render(<PricingTable />);
const button = screen.getByRole("button", { name: "Upgrade to Pro" });
const button = screen.getByRole("button", { name: "Upgrade to Growth" });
fireEvent.click(button);
await waitFor(() => {

View File

@ -6,32 +6,44 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
// runs happily in node. Splitting keeps the node tests fast.
// ---------------------------------------------------------------------------
// 401 handling — gated on SaaS-tenant hostname
// 401 handling — session-probe-before-redirect
// ---------------------------------------------------------------------------
//
// Before fix/quickstart-bugless, any 401 from any endpoint triggered
// `redirectToLogin()`, navigating to `/cp/auth/login`. That route
// exists only on SaaS (mounted by cp_proxy when CP_UPSTREAM_URL is
// set). On localhost / self-hosted / Vercel preview it 404s, so the
// user lands on a broken login page instead of seeing the actual error.
// History:
// 1. fix/quickstart-bugless: gated redirect on SaaS hostname (slug).
// 2. fix/api-401-probe-before-redirect (this file): probe /cp/auth/me
// before redirecting on a 401 from a non-auth path. The earlier
// behaviour redirected on EVERY 401, so a single 401 from
// /workspaces/:id/plugins (workspace-scoped — refused by the
// tenant admin bearer) yanked the user to AuthKit even when
// the session was fine. The probe lets us tell "session dead"
// from "endpoint refused this token."
//
// These tests lock in:
// - SaaS tenant hostname (*.moleculesai.app) → 401 still redirects.
// - non-SaaS hostname (localhost, LAN IP, apex) → 401 throws, no
// redirect, so the caller renders a real error affordance.
// Matrix:
// slug | path | probe → me | expected
// --- | --- | --- | ---
// acme | /cp/auth/me | (n/a) | redirect (path IS auth)
// acme | /workspaces/... | 401 | redirect (session dead)
// acme | /workspaces/... | 200 | throw, no redirect
// acme | /workspaces/... | network err| throw, no redirect
// "" | /workspaces/... | (n/a) | throw, no redirect (no slug)
const mockFetch = vi.fn();
globalThis.fetch = mockFetch;
function mockFailure(status: number, text: string) {
function mockNextResponse(status: number, text = "") {
mockFetch.mockResolvedValueOnce({
ok: false,
ok: status >= 200 && status < 300,
status,
json: () => Promise.reject(new Error("no json")),
text: () => Promise.resolve(text),
} as unknown as Response);
}
function mockNextNetworkError() {
mockFetch.mockRejectedValueOnce(new Error("network"));
}
function setHostname(host: string) {
Object.defineProperty(window, "location", {
configurable: true,
@ -59,27 +71,66 @@ describe("api 401 handling", () => {
vi.resetModules();
});
it("redirects to login on SaaS tenant hostname", async () => {
it("redirects when /cp/auth/me itself 401s — that IS the session-dead signal", async () => {
setHostname("acme.moleculesai.app");
mockFailure(401, '{"error":"admin auth required"}');
// Single fetch: the /cp/auth/me call itself.
mockNextResponse(401, '{"error":"unauthenticated"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/Session expired/);
await expect(api.get("/cp/auth/me")).rejects.toThrow(/Session expired/);
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
// No probe fired — we already know the session is dead.
expect(mockFetch).toHaveBeenCalledTimes(1);
});
it("redirects when /cp/auth/me probe ALSO 401s — session genuinely dead", async () => {
setHostname("acme.moleculesai.app");
// First call: the workspace-scoped fetch returns 401.
mockNextResponse(401, '{"error":"workspace token required"}');
// Second call: the probe to /cp/auth/me also 401s.
mockNextResponse(401, '{"error":"unauthenticated"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/Session expired/);
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
});
it("does NOT redirect when probe returns 200 — endpoint refused this token, session fine", async () => {
setHostname("acme.moleculesai.app");
// First call: workspace-scoped 401.
mockNextResponse(401, '{"error":"workspace token required"}');
// Second call: probe shows the session is alive.
mockNextResponse(200, '{"user_id":"u1","org_id":"o1","email":"x@y"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
});
it("does NOT redirect when probe network-errors — conservative fallback", async () => {
setHostname("acme.moleculesai.app");
mockNextResponse(401, '{"error":"workspace token required"}');
mockNextNetworkError();
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
});
it("does NOT redirect on localhost — throws a real error instead", async () => {
setHostname("localhost");
mockFailure(401, '{"error":"admin auth required"}');
mockNextResponse(401, '{"error":"admin auth required"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
// No slug → no probe fires either.
expect(mockFetch).toHaveBeenCalledTimes(1);
});
it("does NOT redirect on a LAN hostname", async () => {
setHostname("192.168.1.74");
mockFailure(401, '{"error":"missing workspace auth token"}');
mockNextResponse(401, '{"error":"missing workspace auth token"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/activity")).rejects.toThrow(/401/);
@ -91,7 +142,7 @@ describe("api 401 handling", () => {
// Users landing on app.moleculesai.app (pre-tenant-selection) must
// see the real 401 error rather than loop on login.
setHostname("app.moleculesai.app");
mockFailure(401, '{"error":"admin auth required"}');
mockNextResponse(401, '{"error":"admin auth required"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/401/);

View File

@ -60,15 +60,45 @@ async function request<T>(
return request<T>(method, path, body, retryCount + 1, options);
}
if (res.status === 401) {
// Session expired or credentials lost. On SaaS (tenant subdomain)
// the login page lives at /cp/auth/login and is mounted by the
// control-plane reverse proxy — redirect. On self-hosted / local
// dev / Vercel preview there IS no /cp/* mount, so redirecting
// would navigate to a 404 ("404 page not found") instead of the
// real error the user should see. In that case, throw instead
// and let the caller render a meaningful failure (retry button,
// error banner, etc.).
if (slug) {
// Distinguish "session is dead" from "this endpoint refused this
// token." Old behaviour blanket-redirected on every 401, so a
// single transient 401 from a workspace-scoped endpoint
// (/workspaces/:id/peers, /plugins, etc. that need a workspace
// token rather than the tenant admin bearer) yanked the user
// back to AuthKit even when their session was perfectly fine.
// That broke the staging-tabs E2E for the entire 2026-04-25
// night; #2073/#2074 worked around the symptom in the test by
// mocking 401→200 for every fetch, but the user-facing bug
// stayed.
//
// The canonical "session is dead" signal is /cp/auth/me
// returning 401. For any 401 on a non-auth path, probe
// /cp/auth/me before deciding to redirect:
// - probe 401 → session is actually dead → redirect
// - probe 200 → session is fine, the endpoint just refused
// our specific token → throw a real error,
// caller renders an error state
// - probe network error → assume session-fine (conservative;
// better to throw than to redirect on a
// transient probe failure)
//
// Self-hosted / localhost / reserved subdomains still throw
// without redirecting (slug is empty in those cases) — same
// policy as before.
const isAuthPath = path.startsWith("/cp/auth/");
let sessionDead = isAuthPath;
if (!isAuthPath && slug) {
try {
const probe = await fetch(`${PLATFORM_URL}/cp/auth/me`, {
credentials: "include",
signal: AbortSignal.timeout(5000),
});
sessionDead = probe.status === 401;
} catch {
// Probe failed (network/timeout) — fall through to throw.
}
}
if (sessionDead && slug) {
const { redirectToLogin } = await import("./auth");
redirectToLogin("sign-in");
throw new Error("Session expired — redirecting to login");

View File

@ -32,6 +32,10 @@ export interface Plan {
// plans is the canonical order shown on the pricing page: free → starter
// → pro. Change the order here + the rendered columns follow. Keeping
// this as a module-level const so tests can assert against a known list.
//
// Flat-rate positioning (Issue #1833): "starter" and "pro" are flat-rate
// per-org, not per-seat. This is a deliberate wedge against Cursor/Windsurf
// ($40/seat): even a single seat is ~28% cheaper ($29 vs $40), and a
// 5-engineer team is ~85% cheaper ($29 vs 5 × $40 = $200).
export const plans: Plan[] = [
{
id: "free",
@ -48,8 +52,8 @@ export const plans: Plan[] = [
},
{
id: "starter",
name: "Starter",
tagline: "For small teams shipping real agents",
name: "Team",
tagline: "Flat-rate for teams — one price, no per-seat fees",
price: "$29/month",
features: [
"10 workspaces",
@ -57,14 +61,15 @@ export const plans: Plan[] = [
"Private Upstash Redis namespace",
"Email support (48h)",
"5M LLM tokens / month included",
"No per-seat pricing",
],
ctaLabel: "Upgrade to Starter",
ctaLabel: "Upgrade to Team",
highlighted: true,
},
{
id: "pro",
name: "Pro",
tagline: "For production multi-agent orgs",
name: "Growth",
tagline: "Flat-rate for production multi-agent orgs",
price: "$99/month",
features: [
"Unlimited workspaces",
@ -72,9 +77,10 @@ export const plans: Plan[] = [
"Cross-workspace A2A audit log",
"Priority support (24h)",
"25M LLM tokens / month included",
"No per-seat pricing",
"Usage-based overage billing",
],
ctaLabel: "Upgrade to Pro",
ctaLabel: "Upgrade to Growth",
},
];

View File

@ -32,7 +32,7 @@
set -euo pipefail
DRY_RUN=1
MAX_DELETE_PCT=50 # refuse to delete more than half the records in one run
MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}" # refuse to delete more than this pct of records in one run; caller can override via env
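# Example one-off override (invocation illustrative; substitute the
# real script path):
#   MAX_DELETE_PCT=80 AWS_DEFAULT_REGION=us-west-2 bash <this script>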
REGION="${AWS_DEFAULT_REGION:-us-east-2}"
for arg in "$@"; do