Merge branch 'staging' into fix/canvas-multilevel-layout-ux
commit 8543bae83f

19  .github/workflows/canary-staging.yml  (vendored)
@@ -43,6 +43,17 @@ jobs:
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
      # Without an LLM key the test_staging_full_saas.sh script provisions
      # the workspace with empty secrets, hermes derive-provider.sh resolves
      # `openai/gpt-4o` to PROVIDER=openrouter, no OPENROUTER_API_KEY is
      # found in env, and A2A returns "No LLM provider configured" at
      # request time (canary step 8/11). The full-lifecycle workflow
      # (e2e-staging-saas.yml) has carried this secret since launch — the
      # canary regressed when it was first split out and lost the env
      # block. Issue #1500 had ~30 consecutive failures before this was
      # spotted; do NOT remove without re-reading the script's secrets-
      # injection block.
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
      E2E_MODE: canary
      E2E_RUNTIME: hermes
      E2E_RUN_ID: "canary-${{ github.run_id }}"
@@ -57,6 +68,14 @@ jobs:
            exit 2
          fi

      - name: Verify OpenAI key present
        run: |
          if [ -z "$E2E_OPENAI_API_KEY" ]; then
            echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'"
            exit 2
          fi
          echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})"

      - name: Canary run
        id: canary
        run: bash tests/e2e/test_staging_full_saas.sh
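For local debugging, the same canary can be reproduced outside CI by exporting the variables the workflow sets before invoking the script. A minimal sketch, assuming test_staging_full_saas.sh reads exactly these environment variables and nothing else (values are placeholders, not real secrets):

    export MOLECULE_CP_URL=https://staging-api.moleculesai.app
    export MOLECULE_ADMIN_TOKEN=...        # staging CP admin token
    export E2E_OPENAI_API_KEY=...          # any key the staging workspace may use
    export E2E_MODE=canary E2E_RUNTIME=hermes
    export E2E_RUN_ID="canary-local-$(date +%s)"
    bash tests/e2e/test_staging_full_saas.sh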
164  .github/workflows/redeploy-tenants-on-main.yml  (vendored, new file)
@@ -0,0 +1,164 @@
name: redeploy-tenants-on-main

# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
#   1. publish-workspace-server-image completes → new :latest in GHCR.
#   2. This workflow fires via workflow_run, waits 30s for GHCR's
#      CDN to propagate the new tag to the region the tenants pull from.
#   3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s
#      soak. Canary proves the image boots; batches follow.
#   4. Any failure aborts the rollout and leaves older tenants on the
#      prior image — a safer default than a half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.
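A manual rollback dispatch would look roughly like this. A sketch only, assuming the GitHub CLI is available and the workflow is addressed by its file name; the SHA is a placeholder for whichever earlier :<sha> tag should be restored:

    gh workflow run redeploy-tenants-on-main.yml \
      -f target_tag=<previous-sha> \
      -f dry_run=true    # inspect the plan first, then re-run with dry_run=false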
on:
  workflow_run:
    workflows: ['publish-workspace-server-image']
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      target_tag:
        description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.'
        required: false
        type: string
        default: 'latest'
      canary_slug:
        description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
        required: false
        type: string
        default: 'hongmingwang'
      soak_seconds:
        description: 'Seconds to wait after canary before fanning out.'
        required: false
        type: string
        default: '60'
      batch_size:
        description: 'How many tenants SSM redeploys in parallel per batch.'
        required: false
        type: string
        default: '3'
      dry_run:
        description: 'Plan only — do not actually redeploy.'
        required: false
        type: boolean
        default: false

permissions:
  contents: read
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.

jobs:
  redeploy:
    # Skip the auto-trigger if publish-workspace-server-image didn't
    # actually succeed. workflow_run fires on any completion state; we
    # don't want to redeploy against a half-built image.
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    timeout-minutes: 25
    steps:
      - name: Wait for GHCR tag propagation
        # GHCR's edge cache takes ~15-30s to consistently serve the new
        # :latest manifest after the registry accepts the push. Without
        # this sleep, the first tenant's docker pull sometimes races
        # and fetches the previous digest; sleeping is the cheapest
        # way to reduce that without polling GHCR for the new digest.
        run: sleep 30
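For comparison, the polling alternative the comment rules out would look roughly like the following. A sketch only, assuming the image lives at ghcr.io/molecule-ai/platform-tenant (path not confirmed by this diff) and can be resolved with an anonymous pull token via the standard Docker Registry v2 flow GHCR implements:

    IMAGE=molecule-ai/platform-tenant   # assumed image path
    TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:${IMAGE}:pull" | jq -r .token)
    # A HEAD on the manifest returns the Docker-Content-Digest header;
    # loop until it differs from the digest recorded before the publish job.
    curl -sI -H "Authorization: Bearer $TOKEN" \
      -H "Accept: application/vnd.oci.image.index.v1+json, application/vnd.docker.distribution.manifest.v2+json" \
      "https://ghcr.io/v2/${IMAGE}/manifests/latest" | grep -i docker-content-digest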

      - name: Call CP redeploy-fleet
        # CP_ADMIN_API_TOKEN must be set as a repo/org secret on
        # Molecule-AI/molecule-core, matching the staging/prod CP's
        # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
        # repo's secrets for CI.
        env:
          CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
          CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
          TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
          CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }}
          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
          DRY_RUN: ${{ inputs.dry_run || false }}
        run: |
          set -euo pipefail

          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
            echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
            echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
            exit 1
          fi

          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --arg canary "$CANARY_SLUG" \
            --argjson soak "$SOAK_SECONDS" \
            --argjson batch "$BATCH_SIZE" \
            --argjson dry "$DRY_RUN" \
            '{
              target_tag: $tag,
              canary_slug: $canary,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')

          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo " body: $BODY"

          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" || echo "000")

          echo "HTTP $HTTP_CODE"
          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"

          # Pretty-print per-tenant results in the job summary so
          # ops can see which tenants were redeployed without drilling
          # into the raw response.
          {
            echo "## Tenant redeploy fleet"
            echo ""
            echo "**Target tag:** \`$TARGET_TAG\`"
            echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
            echo "**Batch size:** $BATCH_SIZE"
            echo "**Dry run:** $DRY_RUN"
            echo "**HTTP:** $HTTP_CODE"
            echo ""
            echo "### Per-tenant result"
            echo ""
            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
            echo '|------|-------|------------|------|---------|-------|'
            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$HTTP_CODE" != "200" ]; then
            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
            exit 1
          fi
          OK=$(jq -r '.ok' "$HTTP_RESPONSE")
          if [ "$OK" != "true" ]; then
            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
            exit 1
          fi
          echo "::notice::Tenant fleet redeploy complete."
170  .github/workflows/sweep-stale-e2e-orgs.yml  (vendored, new file)
@@ -0,0 +1,170 @@
name: Sweep stale e2e-* orgs (staging)

# Janitor for staging tenants left behind when E2E cleanup didn't run:
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
# bash trap missed (signal 9), etc. Without this loop, every failed
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup — on
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
#
# Why not rely on per-test-run teardown:
# - Per-run teardown is best-effort by definition. Any process death
#   after the test starts but before the trap fires leaves debris.
# - GH Actions cancellation kills the runner without a grace period.
#   The workflow's `if: always()` step usually catches this, but it
#   too can fail (CP transient 5xx, runner network issue at the
#   wrong moment).
# - Even when teardown runs, the CP cascade is best-effort in places
#   (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
# - This sweep is the catch-all that converges staging back to clean
#   regardless of which specific path leaked.
#
# The PROPER fix is making CP cleanup transactional + verify-after-
# terminate (filed separately as cleanup-correctness work). This
# workflow is the safety net that catches everything else AND any
# future leak source we haven't yet identified.
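For an ad-hoc sweep (for example, right after a burst of cancelled CI runs) the workflow can also be dispatched by hand. A sketch, assuming the GitHub CLI; start with dry_run to see what would be deleted:

    gh workflow run "Sweep stale e2e-* orgs (staging)" \
      -f max_age_minutes=60 \
      -f dry_run=true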
on:
  schedule:
    # Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
    # clock from create to teardown). Anything older than the
    # MAX_AGE_MINUTES threshold below is presumed dead.
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      max_age_minutes:
        description: "Delete e2e-* orgs older than N minutes (default 120)"
        required: false
        default: "120"
      dry_run:
        description: "Dry run only — list what would be deleted"
        required: false
        type: boolean
        default: false

# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
# on a manual trigger; queue rather than parallel-delete.
concurrency:
  group: sweep-stale-e2e-orgs
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  sweep:
    name: Sweep e2e orgs
    runs-on: ubuntu-latest
    timeout-minutes: 15
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
      MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
      DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
      # Refuse to delete more than this many orgs in one tick. If the
      # CP DB is briefly empty (or the admin endpoint goes weird and
      # returns no created_at), every e2e- org would look stale.
      # Bailing protects against runaway nukes.
      SAFETY_CAP: 50

    steps:
      - name: Verify admin token present
        run: |
          if [ -z "$ADMIN_TOKEN" ]; then
            echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
            exit 2
          fi
          echo "Admin token present ✓"

      - name: Identify stale e2e orgs
        id: identify
        run: |
          set -euo pipefail
          # Fetch the org list into a file the python step opens (orgs.json) —
          # cleaner than embedding $(curl ...) into a heredoc.
          curl -sS --fail-with-body --max-time 30 \
            "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
            -H "Authorization: Bearer $ADMIN_TOKEN" \
            > orgs.json

          # Filter:
          #   1. slug starts with 'e2e-' (covers e2e-, e2e-canary-,
          #      e2e-canvas-* — all variants the test scripts mint)
          #   2. created_at is older than MAX_AGE_MINUTES ago
          # Output one slug per line to a file the next step reads.
          python3 > stale_slugs.txt <<'PY'
          import json, os
          from datetime import datetime, timezone, timedelta
          with open("orgs.json") as f:
              data = json.load(f)
          max_age = int(os.environ["MAX_AGE_MINUTES"])
          cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
          for o in data.get("orgs", []):
              slug = o.get("slug", "")
              if not slug.startswith("e2e-"):
                  continue
              created = o.get("created_at")
              if not created:
                  # Defensively skip rows without created_at — better
                  # to leave one orphan than nuke a brand-new row
                  # whose timestamp didn't render.
                  continue
              # Python 3.11+ handles RFC3339 with Z directly via
              # fromisoformat; older runners need the trailing Z swap.
              created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
              if created_dt < cutoff:
                  print(slug)
          PY

          count=$(wc -l < stale_slugs.txt | tr -d ' ')
          echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
          if [ "$count" -gt 0 ]; then
            echo "First 20:"
            head -20 stale_slugs.txt | sed 's/^/ /'
          fi
          echo "count=$count" >> "$GITHUB_OUTPUT"

      - name: Safety gate
        if: steps.identify.outputs.count != '0'
        run: |
          count="${{ steps.identify.outputs.count }}"
          if [ "$count" -gt "$SAFETY_CAP" ]; then
            echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
            exit 1
          fi
          echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"

      - name: Delete stale orgs
        if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
        run: |
          set -uo pipefail
          deleted=0
          failed=0
          while IFS= read -r slug; do
            [ -z "$slug" ] && continue
            # The DELETE handler requires {"confirm": "<slug>"} matching
            # the URL slug — fat-finger guard. Idempotent: re-issuing
            # picks up via org_purges.last_step.
            http_code=$(curl -sS -o /tmp/del_resp -w "%{http_code}" \
              --max-time 60 \
              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
              -H "Authorization: Bearer $ADMIN_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"confirm\":\"$slug\"}" || echo "000")
            if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
              deleted=$((deleted+1))
              echo " deleted: $slug"
            else
              failed=$((failed+1))
              echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
            fi
          done < stale_slugs.txt
          echo ""
          echo "Sweep summary: deleted=$deleted failed=$failed"
          # Don't fail the workflow on per-org delete errors — the
          # sweeper is best-effort. Next hourly tick re-attempts. We
          # only fail loud at the safety-cap gate above.

      - name: Dry-run summary
        if: env.DRY_RUN == 'true'
        run: |
          echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s). Re-run with dry_run=false to actually delete."
@@ -5,7 +5,7 @@
 * the per-tenant admin token, provisions one hermes workspace, waits
 * for online, then exports:
 *
 *   STAGING_TENANT_URL      https://<slug>.moleculesai.app
 *   STAGING_TENANT_URL      https://<slug>.staging.moleculesai.app
 *   STAGING_WORKSPACE_ID    UUID of the hermes workspace
 *   STAGING_TENANT_TOKEN    per-tenant admin bearer (for spec requests)
 *   STAGING_SLUG            org slug (used by teardown)
@@ -16,6 +16,11 @@
 *                           CP_ADMIN_API_TOKEN). Drives provision +
 *                           tenant-token retrieval + teardown via a
 *                           single credential.
 *   STAGING_TENANT_DOMAIN   default: staging.moleculesai.app — the
 *                           DNS suffix the CP provisioner writes for
 *                           staging tenants. Override only when
 *                           running this harness against a non-default
 *                           zone.
 */

import type { FullConfig } from "@playwright/test";
@@ -25,6 +30,14 @@ import { join } from "path";
const CP_URL = process.env.MOLECULE_CP_URL || "https://staging-api.moleculesai.app";
const ADMIN_TOKEN = process.env.MOLECULE_ADMIN_TOKEN;
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
// Tenant DNS zone for staging. CP provisioner registers DNS as
// `<slug>.staging.moleculesai.app` (see internal/provisioner/ec2.go's
// EC2 provisioner: DNS log line). The previous default of plain
// `moleculesai.app` matched prod tenant naming and silently broke
// every staging E2E at the TLS readiness step — DNS literally didn't
// resolve, fetch threw NXDOMAIN, waitFor saw null on every poll, and
// the harness wedged at TLS_TIMEOUT_MS instead of failing loud.
const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.app";

// Tenant cold boot on staging regularly takes 12-15 min when the
// workspace-server Docker image isn't already cached on the AMI. Raised
@@ -105,22 +118,44 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
  }
  console.log(`[staging-setup] Org created: ${slug}`);

  // 2. Wait for tenant running (admin-orgs list is the status source)
  // 2. Wait for tenant running (admin-orgs list is the status source).
  //
  // The CP /cp/admin/orgs endpoint returns each org with an
  // `instance_status` field (handlers/admin.go:adminOrgSummary,
  // sourced from `org_instances.status`). NOT `status` — there's no
  // top-level `status` on the row at all. A previous version of this
  // test polled `row.status`, which was always undefined, so this
  // waitFor never resolved truthy and the harness invariably timed
  // out at 1200s — masking real CP bugs (see #242 chain) AND
  // surviving real CP fixes alike.
  // Capture the org UUID alongside the running check — every request
  // we send to the tenant URL after this point needs an
  // X-Molecule-Org-Id header (see workspace-server middleware/tenant_guard.go).
  // Without it, TenantGuard returns 404 ("must not be inferable by
  // probing other orgs' machines"). The CP returns the id on the
  // admin-orgs row; capture it here while we're already polling.
  let orgID = "";
  await waitFor<boolean>(
    async () => {
      const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
      if (r.status !== 200) return null;
      const row = (r.body?.orgs || []).find((o: any) => o.slug === slug);
      if (!row) return null;
      if (row.status === "running") return true;
      if (row.status === "failed") throw new Error(`provision failed: ${slug}`);
      if (row.instance_status === "running") {
        orgID = row.id;
        return true;
      }
      if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`);
      return null;
    },
    PROVISION_TIMEOUT_MS,
    15_000,
    "tenant provision",
  );
  console.log(`[staging-setup] Tenant running`);
  if (!orgID) {
    throw new Error(`expected admin-orgs row to carry id, got empty for slug=${slug}`);
  }
  console.log(`[staging-setup] Tenant running (org_id=${orgID})`);

  // 3. Fetch per-tenant admin token
  const tokRes = await jsonFetch(
@@ -133,7 +168,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
    );
  }
  const tenantToken: string = tokRes.body.admin_token;
  const tenantURL = `https://${slug}.moleculesai.app`;
  const tenantURL = `https://${slug}.${TENANT_DOMAIN}`;
  console.log(`[staging-setup] Tenant URL: ${tenantURL}`);

  // 4. TLS readiness
@@ -154,7 +189,17 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
  );

  // 5. Provision workspace
  const tenantAuth = { Authorization: `Bearer ${tenantToken}` };
  //
  // tenantAuth carries TWO headers, both required:
  //   - Authorization: Bearer <admin-token> — wsAdmin middleware gate
  //   - X-Molecule-Org-Id: <uuid>           — TenantGuard cross-org gate
  // Missing the org-id header silently 404s every non-allowlisted
  // route, with no body and no security headers. The 404 is intentional
  // (existence-non-inference), which makes it look like a missing route.
  const tenantAuth = {
    "Authorization": `Bearer ${tenantToken}`,
    "X-Molecule-Org-Id": orgID,
  };
  const ws = await jsonFetch(`${tenantURL}/workspaces`, {
    method: "POST",
    headers: tenantAuth,
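The same two-header requirement applies to ad-hoc checks against a staging tenant from a shell. A sketch only (slug, token, and org id are placeholders; the request body the harness actually sends is not shown in this diff):

    curl -sS -X POST "https://$SLUG.staging.moleculesai.app/workspaces" \
      -H "Authorization: Bearer $TENANT_TOKEN" \
      -H "X-Molecule-Org-Id: $ORG_ID" \
      -H "Content-Type: application/json" \
      -d '{}'
    # Dropping X-Molecule-Org-Id makes TenantGuard answer 404 with no body,
    # per the existence-non-inference rule described above.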
@@ -63,6 +63,82 @@ test.describe("staging canvas tabs", () => {
      Authorization: `Bearer ${tenantToken}`,
    });

    // canvas/src/components/AuthGate.tsx fetches /cp/auth/me on mount
    // and redirects to the login page on 401. The bearer header above
    // is for platform API calls — it does NOT satisfy /cp/auth/me,
    // which is cookie-based (WorkOS session). Without this mock, the
    // canvas page mounts AuthGate, sees 401 from /cp/auth/me, and
    // redirects away from the tenant URL before the React Flow root
    // ever renders. The [aria-label] selector wait then times out.
    //
    // Intercept /cp/auth/me + return a fake Session shape so AuthGate
    // resolves to "authenticated" and renders {children}. The session
    // contents are cosmetic — the canvas only inspects org_id/user_id
    // in a few places that don't fail when these are dummy values.
    await context.route("**/cp/auth/me", (route) =>
      route.fulfill({
        status: 200,
        contentType: "application/json",
        body: JSON.stringify({
          user_id: `e2e-test-user-${workspaceId}`,
          org_id: "e2e-test-org",
          email: "e2e@test.local",
        }),
      }),
    );

    // Universal 401 → empty-200 fallback (defense-in-depth).
    //
    // The original product bug was canvas/src/lib/api.ts:62-74 calling
    // `redirectToLogin` on EVERY 401 — a single workspace-scoped 401
    // (e.g. /workspaces/:id/peers, /plugins) yanked the user (and the
    // test) to AuthKit. That's now fixed at the source: api.ts probes
    // /cp/auth/me before redirecting, so a 401 from a non-auth path
    // with a live session throws a regular error instead.
    //
    // This route handler stays as a SAFETY NET, not the primary
    // defense:
    //   1. It silences resource-load console noise from the browser
    //      (those messages don't include the URL, so they're useless
    //      for diagnostics; the assertion-block filter drops them
    //      anyway, but having no 401s reach the network is cleaner).
    //   2. It guards against panels that DON'T have try/catch around
    //      their api calls — an unhandled rejection would surface
    //      as console.error → fail the assertion. Panels SHOULD
    //      handle errors, but until they're all audited, this is
    //      the test's belt to api.ts's braces.
    //
    // Pass-through real responses; swap 401s for 200 + empty body.
    // Skip /cp/auth/me (mocked above) and non-fetch resources
    // (HTML/JS/CSS bundles that should NOT be intercepted).
    await context.route("**", async (route, request) => {
      if (request.resourceType() !== "fetch") {
        return route.fallback();
      }
      // /cp/auth/me is mocked above with a fixed Session shape — let
      // that handler win without us round-tripping the network.
      if (request.url().includes("/cp/auth/me")) {
        return route.fallback();
      }
      let resp;
      try {
        resp = await route.fetch();
      } catch {
        return route.fallback();
      }
      if (resp.status() !== 401) {
        return route.fulfill({ response: resp });
      }
      const lastSeg =
        new URL(request.url()).pathname.split("/").filter(Boolean).pop() || "";
      const looksLikeList = !/^[0-9a-f-]{8,}$/.test(lastSeg);
      await route.fulfill({
        status: 200,
        contentType: "application/json",
        body: looksLikeList ? "[]" : "{}",
      });
    });

    const consoleErrors: string[] = [];
    page.on("console", (msg) => {
      if (msg.type() === "error") {
@@ -70,13 +146,38 @@ test.describe("staging canvas tabs", () => {
      }
    });

    await page.goto(tenantURL, { waitUntil: "networkidle" });
    // Capture the URL of any failed network request so a "Failed to load
    // resource: 404" console message we filter out below leaves a
    // breadcrumb. Browser console messages for resource-load failures
    // omit the URL, so we'd otherwise be flying blind. Logged to the
    // test's stdout (visible in the workflow log under the failed step).
    page.on("requestfailed", (req) => {
      console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
    });
    page.on("response", (res) => {
      if (res.status() >= 400) {
        console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
      }
    });

    // waitUntil="networkidle" is wrong here — the canvas keeps a
    // WebSocket open + polls /events and /workspaces every few
    // seconds, so the network is *never* idle for 500ms. page.goto
    // would hang until its 45s default timeout. "domcontentloaded"
    // returns as soon as the HTML is parsed; React hydration + the
    // selector wait below is what actually gates ready-for-interaction.
    await page.goto(tenantURL, { waitUntil: "domcontentloaded" });

    // Canvas hydration races WebSocket connect + /workspaces fetch.
    // Wait for the tablist element (appears after a workspace is
    // selected) or the hydration-error banner — whichever wins first.
    // Wait for the React Flow canvas wrapper (always present once
    // hydrated, even with zero workspaces) or the hydration-error
    // banner — whichever wins first. Previous version of this wait
    // used `[role="tablist"]`, but that selector only appears AFTER
    // a workspace node is clicked (which happens below at L100), so
    // the wait would always time out at 45s before any meaningful
    // failure surfaced.
    await page.waitForSelector(
      '[role="tablist"], [data-testid="hydration-error"]',
      '[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
      { timeout: 45_000 },
    );

@@ -106,6 +207,15 @@ test.describe("staging canvas tabs", () => {
    for (const tabId of TAB_IDS) {
      await test.step(`tab: ${tabId}`, async () => {
        const tabButton = page.locator(`#tab-${tabId}`);
        // The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
        // wrapper) — tabs after position ~3 are clipped behind the
        // right-edge fade gradient on smaller viewports. Playwright's
        // `toBeVisible()` returns false for clipped elements, so a
        // bare visibility check fails on `skills` and later tabs in
        // CI. scrollIntoViewIfNeeded brings the button into view
        // before the visibility check, mirroring what SidePanel's own
        // keyboard handler does on arrow-key navigation.
        await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
        await expect(
          tabButton,
          `tab-${tabId} button missing — TABS list may have drifted`,
@@ -134,14 +244,22 @@ test.describe("staging canvas tabs", () => {

    // Aggregate console-error budget. Known-noisy sources whitelisted:
    // Sentry, Vercel analytics, WS reconnects (expected on SaaS
    // terminal), favicon 404 (cosmetic).
    // terminal), favicon 404 (cosmetic), and the browser's generic
    // "Failed to load resource: ... 404" message which never includes
    // the URL — uninformative on its own and impossible to filter
    // meaningfully without a URL. The page.on('requestfailed') +
    // page.on('response>=400') logging above captures the actual URLs
    // so a real bug still leaves a breadcrumb in the workflow log;
    // a real exception (panel crash, JS error) surfaces as a typed
    // error with file path which the filter still catches.
    const appErrors = consoleErrors.filter(
      (msg) =>
        !msg.includes("sentry") &&
        !msg.includes("vercel") &&
        !msg.includes("WebSocket") &&
        !msg.includes("favicon") &&
        !msg.includes("molecule-icon.png"), // another cosmetic 404
        !msg.includes("molecule-icon.png") && // cosmetic 404
        !msg.includes("Failed to load resource"),
    );
    expect(
      appErrors,
@@ -74,6 +74,11 @@ export default function Home() {
      {hydrationError && (
        <div
          role="alert"
          // Stable testid so the staging E2E (canvas/e2e/staging-tabs.spec.ts)
          // can detect this banner without depending on the role="alert"
          // selector that's used by other transient toasts. Don't rename
          // without updating that spec.
          data-testid="hydration-error"
          className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-4 z-[9999]"
        >
          <p className="text-zinc-400 text-sm">{hydrationError}</p>
@@ -14,7 +14,7 @@ import { PricingTable } from "@/components/PricingTable";
export const metadata = {
  title: "Pricing — Molecule AI",
  description:
    "Free while you tinker, paid tiers for shipping production multi-agent organizations. Transparent usage-based overage pricing on Pro.",
    "Flat-rate team and org pricing — no per-seat fees. Free to start, $29/month for teams, $99/month for production orgs. Full runtime stack included on every paid tier.",
};

export default function PricingPage() {
@@ -25,9 +25,12 @@ export default function PricingPage() {
          Pricing
        </h1>
        <p className="mx-auto mt-4 max-w-2xl text-lg text-zinc-300">
          Free while you tinker. Pay when you ship real agents to production.
          Every tier includes the full runtime stack — you upgrade for scale,
          support, and dedicated infrastructure.
          One flat price per org — not per seat. Every paid tier includes the
          full runtime stack. You upgrade for scale, support, and dedicated
          infrastructure.
        </p>
        <p className="mx-auto mt-2 max-w-xl text-sm text-zinc-400">
          5-person team? You pay $29/month — not $200. No seat math, ever.
        </p>
      </div>

@@ -53,7 +56,8 @@ export default function PricingPage() {
          .
        </p>
        <p className="mt-6 text-sm text-zinc-500">
          Prices shown in USD. Enterprise / self-hosted licensing available — contact us.
          Prices shown in USD. Flat-rate per org — no per-seat fees on any paid tier.
          Enterprise / self-hosted licensing available — contact us.
        </p>
      </section>
@@ -50,14 +50,14 @@ describe("PricingTable", () => {
  it("renders all three plans with their CTAs", () => {
    render(<PricingTable />);
    expect(screen.getByRole("heading", { name: "Free" })).toBeTruthy();
    expect(screen.getByRole("heading", { name: "Starter" })).toBeTruthy();
    expect(screen.getByRole("heading", { name: "Pro" })).toBeTruthy();
    expect(screen.getByRole("heading", { name: "Team" })).toBeTruthy();
    expect(screen.getByRole("heading", { name: "Growth" })).toBeTruthy();
    expect(screen.getByRole("button", { name: "Get started" })).toBeTruthy();
    expect(screen.getByRole("button", { name: "Upgrade to Starter" })).toBeTruthy();
    expect(screen.getByRole("button", { name: "Upgrade to Pro" })).toBeTruthy();
    expect(screen.getByRole("button", { name: "Upgrade to Team" })).toBeTruthy();
    expect(screen.getByRole("button", { name: "Upgrade to Growth" })).toBeTruthy();
  });

  it("shows the 'Most popular' badge only on the starter card", () => {
  it("shows the 'Most popular' badge only on the Team card", () => {
    render(<PricingTable />);
    const badges = screen.getAllByText("Most popular");
    expect(badges.length).toBe(1);
@@ -74,7 +74,7 @@ describe("PricingTable", () => {
  it("Paid CTA + anonymous → bounces to signup (no checkout call)", async () => {
    mockedFetchSession.mockResolvedValue(null);
    render(<PricingTable />);
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
    await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
    expect(mockedStartCheckout).not.toHaveBeenCalled();
  });
@@ -91,7 +91,7 @@ describe("PricingTable", () => {
    });

    render(<PricingTable />);
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));

    await waitFor(() =>
      expect(mockedStartCheckout).toHaveBeenCalledWith("pro", "acme"),
@@ -111,7 +111,7 @@ describe("PricingTable", () => {
    mockedGetTenantSlug.mockReturnValue("");

    render(<PricingTable />);
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));

    await waitFor(() => {
      const alert = screen.getByRole("alert");
@@ -129,7 +129,7 @@ describe("PricingTable", () => {
    mockedStartCheckout.mockRejectedValue(new Error("checkout: 500 boom"));

    render(<PricingTable />);
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));

    await waitFor(() => {
      const alert = screen.getByRole("alert");
@@ -140,7 +140,7 @@ describe("PricingTable", () => {
  it("treats fetchSession network errors as anonymous (fail-closed to signup)", async () => {
    mockedFetchSession.mockRejectedValue(new Error("network down"));
    render(<PricingTable />);
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
    fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
    await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
    expect(mockedStartCheckout).not.toHaveBeenCalled();
  });
@@ -155,7 +155,7 @@ describe("PricingTable", () => {
    mockedStartCheckout.mockReturnValue(new Promise(() => {}));

    render(<PricingTable />);
    const button = screen.getByRole("button", { name: "Upgrade to Pro" });
    const button = screen.getByRole("button", { name: "Upgrade to Growth" });
    fireEvent.click(button);

    await waitFor(() => {
@@ -6,32 +6,44 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
// runs happily in node. Splitting keeps the node tests fast.

// ---------------------------------------------------------------------------
// 401 handling — gated on SaaS-tenant hostname
// 401 handling — session-probe-before-redirect
// ---------------------------------------------------------------------------
//
// Before fix/quickstart-bugless, any 401 from any endpoint triggered
// `redirectToLogin()`, navigating to `/cp/auth/login`. That route
// exists only on SaaS (mounted by cp_proxy when CP_UPSTREAM_URL is
// set). On localhost / self-hosted / Vercel preview it 404s, so the
// user lands on a broken login page instead of seeing the actual error.
// History:
//   1. fix/quickstart-bugless: gated redirect on SaaS hostname (slug).
//   2. fix/api-401-probe-before-redirect (this file): probe /cp/auth/me
//      before redirecting on a 401 from a non-auth path. The earlier
//      behaviour redirected on EVERY 401, so a single 401 from
//      /workspaces/:id/plugins (workspace-scoped — refused by the
//      tenant admin bearer) yanked the user to AuthKit even when
//      the session was fine. The probe lets us tell "session dead"
//      from "endpoint refused this token."
//
// These tests lock in:
//   - SaaS tenant hostname (*.moleculesai.app) → 401 still redirects.
//   - non-SaaS hostname (localhost, LAN IP, apex) → 401 throws, no
//     redirect, so the caller renders a real error affordance.
// Matrix:
//   slug  | path            | probe → me  | expected
//   ---   | ---             | ---         | ---
//   acme  | /cp/auth/me     | (n/a)       | redirect (path IS auth)
//   acme  | /workspaces/... | 401         | redirect (session dead)
//   acme  | /workspaces/... | 200         | throw, no redirect
//   acme  | /workspaces/... | network err | throw, no redirect
//   ""    | /workspaces/... | (n/a)       | throw, no redirect (no slug)

const mockFetch = vi.fn();
globalThis.fetch = mockFetch;

function mockFailure(status: number, text: string) {
function mockNextResponse(status: number, text = "") {
  mockFetch.mockResolvedValueOnce({
    ok: false,
    ok: status >= 200 && status < 300,
    status,
    json: () => Promise.reject(new Error("no json")),
    text: () => Promise.resolve(text),
  } as unknown as Response);
}

function mockNextNetworkError() {
  mockFetch.mockRejectedValueOnce(new Error("network"));
}

function setHostname(host: string) {
  Object.defineProperty(window, "location", {
    configurable: true,
@@ -59,27 +71,66 @@ describe("api 401 handling", () => {
    vi.resetModules();
  });

  it("redirects to login on SaaS tenant hostname", async () => {
  it("redirects when /cp/auth/me itself 401s — that IS the session-dead signal", async () => {
    setHostname("acme.moleculesai.app");
    mockFailure(401, '{"error":"admin auth required"}');
    // Single fetch: the /cp/auth/me call itself.
    mockNextResponse(401, '{"error":"unauthenticated"}');

    const { api } = await import("../api");
    await expect(api.get("/workspaces")).rejects.toThrow(/Session expired/);
    await expect(api.get("/cp/auth/me")).rejects.toThrow(/Session expired/);
    expect(redirectSpy).toHaveBeenCalledWith("sign-in");
    // No probe fired — we already know the session is dead.
    expect(mockFetch).toHaveBeenCalledTimes(1);
  });

  it("redirects when /cp/auth/me probe ALSO 401s — session genuinely dead", async () => {
    setHostname("acme.moleculesai.app");
    // First call: the workspace-scoped fetch returns 401.
    mockNextResponse(401, '{"error":"workspace token required"}');
    // Second call: the probe to /cp/auth/me also 401s.
    mockNextResponse(401, '{"error":"unauthenticated"}');

    const { api } = await import("../api");
    await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/Session expired/);
    expect(redirectSpy).toHaveBeenCalledWith("sign-in");
  });

  it("does NOT redirect when probe returns 200 — endpoint refused this token, session fine", async () => {
    setHostname("acme.moleculesai.app");
    // First call: workspace-scoped 401.
    mockNextResponse(401, '{"error":"workspace token required"}');
    // Second call: probe shows the session is alive.
    mockNextResponse(200, '{"user_id":"u1","org_id":"o1","email":"x@y"}');

    const { api } = await import("../api");
    await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
    expect(redirectSpy).not.toHaveBeenCalled();
  });

  it("does NOT redirect when probe network-errors — conservative fallback", async () => {
    setHostname("acme.moleculesai.app");
    mockNextResponse(401, '{"error":"workspace token required"}');
    mockNextNetworkError();

    const { api } = await import("../api");
    await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
    expect(redirectSpy).not.toHaveBeenCalled();
  });

  it("does NOT redirect on localhost — throws a real error instead", async () => {
    setHostname("localhost");
    mockFailure(401, '{"error":"admin auth required"}');
    mockNextResponse(401, '{"error":"admin auth required"}');

    const { api } = await import("../api");
    await expect(api.get("/workspaces")).rejects.toThrow(/401/);
    expect(redirectSpy).not.toHaveBeenCalled();
    // No slug → no probe fires either.
    expect(mockFetch).toHaveBeenCalledTimes(1);
  });

  it("does NOT redirect on a LAN hostname", async () => {
    setHostname("192.168.1.74");
    mockFailure(401, '{"error":"missing workspace auth token"}');
    mockNextResponse(401, '{"error":"missing workspace auth token"}');

    const { api } = await import("../api");
    await expect(api.get("/workspaces/abc/activity")).rejects.toThrow(/401/);
@@ -91,7 +142,7 @@ describe("api 401 handling", () => {
    // Users landing on app.moleculesai.app (pre-tenant-selection) must
    // see the real 401 error rather than loop on login.
    setHostname("app.moleculesai.app");
    mockFailure(401, '{"error":"admin auth required"}');
    mockNextResponse(401, '{"error":"admin auth required"}');

    const { api } = await import("../api");
    await expect(api.get("/workspaces")).rejects.toThrow(/401/);
@@ -60,15 +60,45 @@ async function request<T>(
    return request<T>(method, path, body, retryCount + 1, options);
  }
  if (res.status === 401) {
    // Session expired or credentials lost. On SaaS (tenant subdomain)
    // the login page lives at /cp/auth/login and is mounted by the
    // control-plane reverse proxy — redirect. On self-hosted / local
    // dev / Vercel preview there IS no /cp/* mount, so redirecting
    // would navigate to a 404 ("404 page not found") instead of the
    // real error the user should see. In that case, throw instead
    // and let the caller render a meaningful failure (retry button,
    // error banner, etc.).
    if (slug) {
    // Distinguish "session is dead" from "this endpoint refused this
    // token." Old behaviour blanket-redirected on every 401, so a
    // single transient 401 from a workspace-scoped endpoint
    // (/workspaces/:id/peers, /plugins, etc. that need a workspace
    // token rather than the tenant admin bearer) yanked the user
    // back to AuthKit even when their session was perfectly fine.
    // That broke the staging-tabs E2E for the entire 2026-04-25
    // night; #2073/#2074 worked around the symptom in the test by
    // mocking 401→200 for every fetch, but the user-facing bug
    // stayed.
    //
    // The canonical "session is dead" signal is /cp/auth/me
    // returning 401. For any 401 on a non-auth path, probe
    // /cp/auth/me before deciding to redirect:
    //   - probe 401 → session is actually dead → redirect
    //   - probe 200 → session is fine, the endpoint just refused
    //                 our specific token → throw a real error,
    //                 caller renders an error state
    //   - probe network error → assume session-fine (conservative;
    //                 better to throw than to redirect on a
    //                 transient probe failure)
    //
    // Self-hosted / localhost / reserved subdomains still throw
    // without redirecting (slug is empty in those cases) — same
    // policy as before.
    const isAuthPath = path.startsWith("/cp/auth/");
    let sessionDead = isAuthPath;
    if (!isAuthPath && slug) {
      try {
        const probe = await fetch(`${PLATFORM_URL}/cp/auth/me`, {
          credentials: "include",
          signal: AbortSignal.timeout(5000),
        });
        sessionDead = probe.status === 401;
      } catch {
        // Probe failed (network/timeout) — fall through to throw.
      }
    }
    if (sessionDead && slug) {
      const { redirectToLogin } = await import("./auth");
      redirectToLogin("sign-in");
      throw new Error("Session expired — redirecting to login");
@@ -32,6 +32,10 @@ export interface Plan {
// plans is the canonical order shown on the pricing page: free → starter
// → pro. Change the order here + the rendered columns follow. Keeping
// this as a module-level const so tests can assert against a known list.
//
// Flat-rate positioning (Issue #1833): "starter" and "pro" are flat-rate
// per-org, not per-seat. This is a deliberate wedge against Cursor/Windsurf
// ($40/seat) — a 5-engineer team pays $200/month there vs $29/month here.
export const plans: Plan[] = [
  {
    id: "free",
@@ -48,8 +52,8 @@ export const plans: Plan[] = [
  },
  {
    id: "starter",
    name: "Starter",
    tagline: "For small teams shipping real agents",
    name: "Team",
    tagline: "Flat-rate for teams — one price, no per-seat fees",
    price: "$29/month",
    features: [
      "10 workspaces",
@@ -57,14 +61,15 @@ export const plans: Plan[] = [
      "Private Upstash Redis namespace",
      "Email support (48h)",
      "5M LLM tokens / month included",
      "No per-seat pricing",
    ],
    ctaLabel: "Upgrade to Starter",
    ctaLabel: "Upgrade to Team",
    highlighted: true,
  },
  {
    id: "pro",
    name: "Pro",
    tagline: "For production multi-agent orgs",
    name: "Growth",
    tagline: "Flat-rate for production multi-agent orgs",
    price: "$99/month",
    features: [
      "Unlimited workspaces",
@@ -72,9 +77,10 @@ export const plans: Plan[] = [
      "Cross-workspace A2A audit log",
      "Priority support (24h)",
      "25M LLM tokens / month included",
      "No per-seat pricing",
      "Usage-based overage billing",
    ],
    ctaLabel: "Upgrade to Pro",
    ctaLabel: "Upgrade to Growth",
  },
];
@@ -32,7 +32,7 @@
set -euo pipefail

DRY_RUN=1
MAX_DELETE_PCT=50   # refuse to delete more than half the records in one run
MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}"   # refuse to delete more than this pct of records in one run; caller can override via env
REGION="${AWS_DEFAULT_REGION:-us-east-2}"

for arg in "$@"; do