diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml index 32cba939..0c4bae19 100644 --- a/.github/workflows/canary-staging.yml +++ b/.github/workflows/canary-staging.yml @@ -43,6 +43,17 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + # Without an LLM key the test_staging_full_saas.sh script provisions + # the workspace with empty secrets, hermes derive-provider.sh resolves + # `openai/gpt-4o` to PROVIDER=openrouter, no OPENROUTER_API_KEY is + # found in env, and A2A returns "No LLM provider configured" at + # request time (canary step 8/11). The full-lifecycle workflow + # (e2e-staging-saas.yml) has carried this secret since launch — the + # canary regressed when it was first split out and lost the env + # block. Issue #1500 had ~30 consecutive failures before this was + # spotted; do NOT remove without re-reading the script's secrets- + # injection block. + E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} E2E_MODE: canary E2E_RUNTIME: hermes E2E_RUN_ID: "canary-${{ github.run_id }}" @@ -57,6 +68,14 @@ jobs: exit 2 fi + - name: Verify OpenAI key present + run: | + if [ -z "$E2E_OPENAI_API_KEY" ]; then + echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'" + exit 2 + fi + echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})" + - name: Canary run id: canary run: bash tests/e2e/test_staging_full_saas.sh diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml new file mode 100644 index 00000000..e0f84da5 --- /dev/null +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -0,0 +1,164 @@ +name: redeploy-tenants-on-main + +# Auto-refresh prod tenant EC2s after every main merge. +# +# Why this workflow exists: publish-workspace-server-image builds and +# pushes a new platform-tenant:latest + : to GHCR on every merge +# to main, but running tenants pulled their image once at boot and +# never re-pull. Users see stale code indefinitely. +# +# This workflow closes the gap by calling the control-plane admin +# endpoint that performs a canary-first, batched, health-gated rolling +# redeploy across every live tenant. Implemented in Molecule-AI/ +# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet +# (feat/tenant-auto-redeploy, landing alongside this workflow). +# +# Runtime ordering: +# 1. publish-workspace-server-image completes → new :latest in GHCR. +# 2. This workflow fires via workflow_run, waits 30s for GHCR's +# CDN to propagate the new tag to the region the tenants pull from. +# 3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s +# soak. Canary proves the image boots; batches follow. +# 4. Any failure aborts the rollout and leaves older tenants on the +# prior image — safer default than half-and-half state. +# +# Rollback path: re-run this workflow with a specific SHA pinned via +# the workflow_dispatch input. That calls redeploy-fleet with +# target_tag=, re-pulling the older image on every tenant. + +on: + workflow_run: + workflows: ['publish-workspace-server-image'] + types: [completed] + branches: [main] + workflow_dispatch: + inputs: + target_tag: + description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.' 
+ required: false + type: string + default: 'latest' + canary_slug: + description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).' + required: false + type: string + default: 'hongmingwang' + soak_seconds: + description: 'Seconds to wait after canary before fanning out.' + required: false + type: string + default: '60' + batch_size: + description: 'How many tenants SSM redeploys in parallel per batch.' + required: false + type: string + default: '3' + dry_run: + description: 'Plan only — do not actually redeploy.' + required: false + type: boolean + default: false + +permissions: + contents: read + # No write scopes needed — the workflow hits an external CP endpoint, + # not the GitHub API. + +jobs: + redeploy: + # Skip the auto-trigger if publish-workspace-server-image didn't + # actually succeed. workflow_run fires on any completion state; we + # don't want to redeploy against a half-built image. + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') + runs-on: ubuntu-latest + timeout-minutes: 25 + steps: + - name: Wait for GHCR tag propagation + # GHCR's edge cache takes ~15-30s to consistently serve the new + # :latest manifest after the registry accepts the push. Without + # this sleep, the first tenant's docker pull sometimes races + # and fetches the previous digest; sleeping is the cheapest + # way to reduce that without polling GHCR for the new digest. + run: sleep 30 + + - name: Call CP redeploy-fleet + # CP_ADMIN_API_TOKEN must be set as a repo/org secret on + # Molecule-AI/molecule-core, matching the staging/prod CP's + # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this + # repo's secrets for CI. + env: + CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }} + CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} + TARGET_TAG: ${{ inputs.target_tag || 'latest' }} + CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }} + SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }} + BATCH_SIZE: ${{ inputs.batch_size || '3' }} + DRY_RUN: ${{ inputs.dry_run || false }} + run: | + set -euo pipefail + + if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then + echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy" + echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy." + exit 1 + fi + + BODY=$(jq -nc \ + --arg tag "$TARGET_TAG" \ + --arg canary "$CANARY_SLUG" \ + --argjson soak "$SOAK_SECONDS" \ + --argjson batch "$BATCH_SIZE" \ + --argjson dry "$DRY_RUN" \ + '{ + target_tag: $tag, + canary_slug: $canary, + soak_seconds: $soak, + batch_size: $batch, + dry_run: $dry + }') + + echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet" + echo " body: $BODY" + + HTTP_RESPONSE=$(mktemp) + HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ + -m 1200 \ + -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ + -H "Content-Type: application/json" \ + -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \ + -d "$BODY" || echo "000") + + echo "HTTP $HTTP_CODE" + cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE" + + # Pretty-print per-tenant results in the job summary so + # ops can see which tenants were redeployed without drilling + # into the raw response. 
+ { + echo "## Tenant redeploy fleet" + echo "" + echo "**Target tag:** \`$TARGET_TAG\`" + echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)" + echo "**Batch size:** $BATCH_SIZE" + echo "**Dry run:** $DRY_RUN" + echo "**HTTP:** $HTTP_CODE" + echo "" + echo "### Per-tenant result" + echo "" + echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |' + echo '|------|-------|------------|------|---------|-------|' + jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true + } >> "$GITHUB_STEP_SUMMARY" + + if [ "$HTTP_CODE" != "200" ]; then + echo "::error::redeploy-fleet returned HTTP $HTTP_CODE" + exit 1 + fi + OK=$(jq -r '.ok' "$HTTP_RESPONSE") + if [ "$OK" != "true" ]; then + echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)" + exit 1 + fi + echo "::notice::Tenant fleet redeploy complete." diff --git a/.github/workflows/retarget-main-to-staging.yml b/.github/workflows/retarget-main-to-staging.yml index 90fd3d55..0c59ca98 100644 --- a/.github/workflows/retarget-main-to-staging.yml +++ b/.github/workflows/retarget-main-to-staging.yml @@ -33,18 +33,49 @@ jobs: || github.event.pull_request.user.login == 'molecule-ai[bot]' steps: - name: Retarget PR base to staging + id: retarget env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} PR_AUTHOR: ${{ github.event.pull_request.user.login }} + # Issue #1884: when the bot opens a PR against main and there's + # already another PR on the same head branch targeting staging, + # GitHub's PATCH /pulls returns 422 with + # "A pull request already exists for base branch 'staging' …". + # The retarget can't proceed — but the right response is to + # close the now-redundant main-PR, not to fail the workflow + # noisily. Detect that specific 422 and close instead. run: | + set +e echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging" - gh api -X PATCH \ + PATCH_OUTPUT=$(gh api -X PATCH \ "repos/${{ github.repository }}/pulls/${PR_NUMBER}" \ -f base=staging \ - --jq '.base.ref' + --jq '.base.ref' 2>&1) + PATCH_EXIT=$? + set -e + if [ "$PATCH_EXIT" -eq 0 ]; then + echo "::notice::Retargeted PR #${PR_NUMBER} → staging" + echo "outcome=retargeted" >> "$GITHUB_OUTPUT" + exit 0 + fi + # Specifically match the 422 duplicate-base/head error so + # any OTHER PATCH failure (auth, deleted PR, etc.) still + # surfaces as a real workflow failure. + if echo "$PATCH_OUTPUT" | grep -q "pull request already exists for base branch 'staging'"; then + echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant." + gh pr close "$PR_NUMBER" \ + --repo "${{ github.repository }}" \ + --comment "[retarget-bot] Closing — another PR on the same head branch already targets \`staging\`. This PR is redundant. See issue #1884 for the rationale." 
+ echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo "::error::Retarget PATCH failed and was NOT a duplicate-base error:" + echo "$PATCH_OUTPUT" >&2 + exit 1 - name: Post explainer comment + if: steps.retarget.outputs.outcome == 'retargeted' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml new file mode 100644 index 00000000..6913cba2 --- /dev/null +++ b/.github/workflows/sweep-stale-e2e-orgs.yml @@ -0,0 +1,170 @@ +name: Sweep stale e2e-* orgs (staging) + +# Janitor for staging tenants left behind when E2E cleanup didn't run: +# CI cancellations, runner crashes, transient AWS errors mid-cascade, +# bash trap missed (signal 9), etc. Without this loop, every failed +# teardown leaks an EC2 + DNS + DB row until manual ops cleanup — +# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans. +# +# Why not rely on per-test-run teardown: +# - Per-run teardown is best-effort by definition. Any process death +# after the test starts but before the trap fires leaves debris. +# - GH Actions cancellation kills the runner without grace period. +# The workflow's `if: always()` step usually catches this, but it +# too can fail (CP transient 5xx, runner network issue at the +# wrong moment). +# - Even when teardown runs, the CP cascade is best-effort in places +# (cascadeTerminateWorkspaces logs+continues; DNS deletion same). +# - This sweep is the catch-all that converges staging back to clean +# regardless of which specific path leaked. +# +# The PROPER fix is making CP cleanup transactional + verify-after- +# terminate (filed separately as cleanup-correctness work). This +# workflow is the safety net that catches everything else AND any +# future leak source we haven't yet identified. + +on: + schedule: + # Every hour on the hour. E2E orgs are short-lived (~10-25 min wall + # clock from create to teardown). Anything older than the + # MAX_AGE_MINUTES threshold below is presumed dead. + - cron: '0 * * * *' + workflow_dispatch: + inputs: + max_age_minutes: + description: "Delete e2e-* orgs older than N minutes (default 120)" + required: false + default: "120" + dry_run: + description: "Dry run only — list what would be deleted" + required: false + type: boolean + default: false + +# Don't let two sweeps fight. Cron + workflow_dispatch could overlap +# on a manual trigger; queue rather than parallel-delete. +concurrency: + group: sweep-stale-e2e-orgs + cancel-in-progress: false + +permissions: + contents: read + +jobs: + sweep: + name: Sweep e2e orgs + runs-on: ubuntu-latest + timeout-minutes: 15 + env: + MOLECULE_CP_URL: https://staging-api.moleculesai.app + ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }} + DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }} + # Refuse to delete more than this many orgs in one tick. If the + # CP DB is briefly empty (or the admin endpoint goes weird and + # returns no created_at), every e2e- org would look stale. + # Bailing protects against runaway nukes. 
+ SAFETY_CAP: 50 + + steps: + - name: Verify admin token present + run: | + if [ -z "$ADMIN_TOKEN" ]; then + echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set" + exit 2 + fi + echo "Admin token present ✓" + + - name: Identify stale e2e orgs + id: identify + run: | + set -euo pipefail + # Fetch into a file so the python step reads it via stdin — + # cleaner than embedding $(curl ...) into a heredoc. + curl -sS --fail-with-body --max-time 30 \ + "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + > orgs.json + + # Filter: + # 1. slug starts with 'e2e-' (covers e2e-, e2e-canary-, + # e2e-canvas-* — all variants the test scripts mint) + # 2. created_at is older than MAX_AGE_MINUTES ago + # Output one slug per line to a file the next step reads. + python3 > stale_slugs.txt <<'PY' + import json, os + from datetime import datetime, timezone, timedelta + with open("orgs.json") as f: + data = json.load(f) + max_age = int(os.environ["MAX_AGE_MINUTES"]) + cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age) + for o in data.get("orgs", []): + slug = o.get("slug", "") + if not slug.startswith("e2e-"): + continue + created = o.get("created_at") + if not created: + # Defensively skip rows without created_at — better + # to leave one orphan than nuke a brand-new row + # whose timestamp didn't render. + continue + # Python 3.11+ handles RFC3339 with Z directly via + # fromisoformat; older runners need the trailing Z swap. + created_dt = datetime.fromisoformat(created.replace("Z", "+00:00")) + if created_dt < cutoff: + print(slug) + PY + + count=$(wc -l < stale_slugs.txt | tr -d ' ') + echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m" + if [ "$count" -gt 0 ]; then + echo "First 20:" + head -20 stale_slugs.txt | sed 's/^/ /' + fi + echo "count=$count" >> "$GITHUB_OUTPUT" + + - name: Safety gate + if: steps.identify.outputs.count != '0' + run: | + count="${{ steps.identify.outputs.count }}" + if [ "$count" -gt "$SAFETY_CAP" ]; then + echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional." + exit 1 + fi + echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓" + + - name: Delete stale orgs + if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true' + run: | + set -uo pipefail + deleted=0 + failed=0 + while IFS= read -r slug; do + [ -z "$slug" ] && continue + # The DELETE handler requires {"confirm": ""} matching + # the URL slug — fat-finger guard. Idempotent: re-issuing + # picks up via org_purges.last_step. + http_code=$(curl -sS -o /tmp/del_resp -w "%{http_code}" \ + --max-time 60 \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"confirm\":\"$slug\"}" || echo "000") + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then + deleted=$((deleted+1)) + echo " deleted: $slug" + else + failed=$((failed+1)) + echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)" + fi + done < stale_slugs.txt + echo "" + echo "Sweep summary: deleted=$deleted failed=$failed" + # Don't fail the workflow on per-org delete errors — the + # sweeper is best-effort. Next hourly tick re-attempts. We + # only fail loud at the safety-cap gate above. 
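The staleness decision above is split across a Python heredoc and a bash safety gate. As a reference for reasoning about its edge cases (missing created_at, cap overflow), here is a minimal TypeScript sketch of the same filter-and-cap logic; the OrgRow shape and helper names are assumptions for illustration, not part of this diff.

// Sketch of the sweep's staleness filter + safety cap (hypothetical helpers).
interface OrgRow {
  slug: string;
  created_at?: string; // RFC3339, e.g. "2026-04-24T08:15:00Z"
}

export function staleE2eSlugs(
  orgs: OrgRow[],
  maxAgeMinutes: number,
  now: Date = new Date(),
): string[] {
  const cutoff = now.getTime() - maxAgeMinutes * 60_000;
  return orgs
    .filter((o) => o.slug.startsWith("e2e-"))
    // Rows without created_at are skipped: better to leave one orphan than
    // delete a brand-new org whose timestamp didn't render.
    .filter((o) => !!o.created_at && Date.parse(o.created_at) < cutoff)
    .map((o) => o.slug);
}

// Mirrors the SAFETY_CAP gate: refuse to act when the candidate list is
// implausibly large (degraded admin API, missing timestamps, etc.).
export function withinSafetyCap(candidates: string[], cap = 50): boolean {
  return candidates.length <= cap;
}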
+ + - name: Dry-run summary + if: env.DRY_RUN == 'true' + run: | + echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s). Re-run with dry_run=false to actually delete." diff --git a/canvas/Dockerfile b/canvas/Dockerfile index 2fb7c92a..e834b7a5 100644 --- a/canvas/Dockerfile +++ b/canvas/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20-alpine AS builder +FROM node:22-alpine AS builder WORKDIR /app COPY package.json package-lock.json* ./ RUN npm install @@ -11,7 +11,7 @@ ENV NEXT_PUBLIC_WS_URL=$NEXT_PUBLIC_WS_URL ENV NEXT_PUBLIC_ADMIN_TOKEN=$NEXT_PUBLIC_ADMIN_TOKEN RUN npm run build -FROM node:20-alpine +FROM node:22-alpine WORKDIR /app COPY --from=builder /app/.next/standalone ./ COPY --from=builder /app/.next/static ./.next/static diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 7147f4ea..963f9ccb 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -5,7 +5,7 @@ * the per-tenant admin token, provisions one hermes workspace, waits * for online, then exports: * - * STAGING_TENANT_URL https://.moleculesai.app + * STAGING_TENANT_URL https://.staging.moleculesai.app * STAGING_WORKSPACE_ID UUID of the hermes workspace * STAGING_TENANT_TOKEN per-tenant admin bearer (for spec requests) * STAGING_SLUG org slug (used by teardown) @@ -16,6 +16,11 @@ * CP_ADMIN_API_TOKEN). Drives provision + * tenant-token retrieval + teardown via a * single credential. + * STAGING_TENANT_DOMAIN default: staging.moleculesai.app — the + * DNS suffix the CP provisioner writes for + * staging tenants. Override only when + * running this harness against a non-default + * zone. */ import type { FullConfig } from "@playwright/test"; @@ -25,6 +30,14 @@ import { join } from "path"; const CP_URL = process.env.MOLECULE_CP_URL || "https://staging-api.moleculesai.app"; const ADMIN_TOKEN = process.env.MOLECULE_ADMIN_TOKEN; const STAGING = process.env.CANVAS_E2E_STAGING === "1"; +// Tenant DNS zone for staging. CP provisioner registers DNS as +// `.staging.moleculesai.app` (see internal/provisioner/ec2.go's +// EC2 provisioner: DNS log line). The previous default of plain +// `moleculesai.app` matched prod tenant naming and silently broke +// every staging E2E at the TLS readiness step — DNS literally didn't +// resolve, fetch threw NXDOMAIN, waitFor saw null on every poll, and +// the harness wedged at TLS_TIMEOUT_MS instead of failing loud. +const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.app"; // Tenant cold boot on staging regularly takes 12-15 min when the // workspace-server Docker image isn't already cached on the AMI. Raised @@ -105,22 +118,44 @@ export default async function globalSetup(_config: FullConfig): Promise { } console.log(`[staging-setup] Org created: ${slug}`); - // 2. Wait for tenant running (admin-orgs list is the status source) + // 2. Wait for tenant running (admin-orgs list is the status source). + // + // The CP /cp/admin/orgs endpoint returns each org with an + // `instance_status` field (handlers/admin.go:adminOrgSummary, + // sourced from `org_instances.status`). NOT `status` — there's no + // top-level `status` on the row at all. A previous version of this + // test polled `row.status`, which was always undefined, so this + // waitFor never resolved truthy and the harness invariably timed + // out at 1200s — masking real CP bugs (see #242 chain) AND + // surviving real CP fixes alike. 
+ // Capture the org UUID alongside the running check — every request + // we send to the tenant URL after this point needs an + // X-Molecule-Org-Id header (see workspace-server middleware/tenant_guard.go). + // Without it, TenantGuard returns 404 ("must not be inferable by + // probing other orgs' machines"). The CP returns the id on the + // admin-orgs row; capture it here while we're already polling. + let orgID = ""; await waitFor( async () => { const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth }); if (r.status !== 200) return null; const row = (r.body?.orgs || []).find((o: any) => o.slug === slug); if (!row) return null; - if (row.status === "running") return true; - if (row.status === "failed") throw new Error(`provision failed: ${slug}`); + if (row.instance_status === "running") { + orgID = row.id; + return true; + } + if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`); return null; }, PROVISION_TIMEOUT_MS, 15_000, "tenant provision", ); - console.log(`[staging-setup] Tenant running`); + if (!orgID) { + throw new Error(`expected admin-orgs row to carry id, got empty for slug=${slug}`); + } + console.log(`[staging-setup] Tenant running (org_id=${orgID})`); // 3. Fetch per-tenant admin token const tokRes = await jsonFetch( @@ -133,7 +168,7 @@ export default async function globalSetup(_config: FullConfig): Promise { ); } const tenantToken: string = tokRes.body.admin_token; - const tenantURL = `https://${slug}.moleculesai.app`; + const tenantURL = `https://${slug}.${TENANT_DOMAIN}`; console.log(`[staging-setup] Tenant URL: ${tenantURL}`); // 4. TLS readiness @@ -154,7 +189,17 @@ export default async function globalSetup(_config: FullConfig): Promise { ); // 5. Provision workspace - const tenantAuth = { Authorization: `Bearer ${tenantToken}` }; + // + // tenantAuth carries TWO headers, both required: + // - Authorization: Bearer — wsAdmin middleware gate + // - X-Molecule-Org-Id: — TenantGuard cross-org gate + // Missing the org-id header silently 404s every non-allowlisted + // route, with no body and no security headers. The 404 is intentional + // (existence-non-inference) which makes it look like a missing route. + const tenantAuth = { + "Authorization": `Bearer ${tenantToken}`, + "X-Molecule-Org-Id": orgID, + }; const ws = await jsonFetch(`${tenantURL}/workspaces`, { method: "POST", headers: tenantAuth, diff --git a/canvas/e2e/staging-tabs.spec.ts b/canvas/e2e/staging-tabs.spec.ts index 412953a5..bfc788ce 100644 --- a/canvas/e2e/staging-tabs.spec.ts +++ b/canvas/e2e/staging-tabs.spec.ts @@ -63,6 +63,82 @@ test.describe("staging canvas tabs", () => { Authorization: `Bearer ${tenantToken}`, }); + // canvas/src/components/AuthGate.tsx fetches /cp/auth/me on mount + // and redirects to the login page on 401. The bearer header above + // is for platform API calls — it does NOT satisfy /cp/auth/me, + // which is cookie-based (WorkOS session). Without this mock, the + // canvas page mounts AuthGate, sees 401 from /cp/auth/me, and + // redirects away from the tenant URL before the React Flow root + // ever renders. The [aria-label] selector wait then times out. + // + // Intercept /cp/auth/me + return a fake Session shape so AuthGate + // resolves to "authenticated" and renders {children}. The session + // contents are cosmetic — the canvas only inspects org_id/user_id + // in a few places that don't fail when these are dummy values. 
+ await context.route("**/cp/auth/me", (route) => + route.fulfill({ + status: 200, + contentType: "application/json", + body: JSON.stringify({ + user_id: `e2e-test-user-${workspaceId}`, + org_id: "e2e-test-org", + email: "e2e@test.local", + }), + }), + ); + + // Universal 401 → empty-200 fallback (defense-in-depth). + // + // The original product bug was canvas/src/lib/api.ts:62-74 calling + // `redirectToLogin` on EVERY 401 — a single workspace-scoped 401 + // (e.g. /workspaces/:id/peers, /plugins) yanked the user (and the + // test) to AuthKit. That's now fixed at the source: api.ts probes + // /cp/auth/me before redirecting, so a 401 from a non-auth path + // with a live session throws a regular error instead. + // + // This route handler stays as a SAFETY NET, not the primary + // defense: + // 1. It silences resource-load console noise from the browser + // (those messages don't include the URL — useless in + // diagnostics, captured by the filter in the assertion + // block but having no 401s reach the network is cleaner). + // 2. It guards against panels that DON'T have try/catch around + // their api calls — an unhandled rejection would surface + // as console.error → fail the assertion. Panels SHOULD + // handle errors, but until they're all audited, this is + // the test's belt to api.ts's braces. + // + // Pass-through real responses; swap 401s for 200 + empty body. + // Skip /cp/auth/me (mocked above) and non-fetch resources + // (HTML/JS/CSS bundles that should NOT be intercepted). + await context.route("**", async (route, request) => { + if (request.resourceType() !== "fetch") { + return route.fallback(); + } + // /cp/auth/me is mocked above with a fixed Session shape — let + // that handler win without us round-tripping the network. + if (request.url().includes("/cp/auth/me")) { + return route.fallback(); + } + let resp; + try { + resp = await route.fetch(); + } catch { + return route.fallback(); + } + if (resp.status() !== 401) { + return route.fulfill({ response: resp }); + } + const lastSeg = + new URL(request.url()).pathname.split("/").filter(Boolean).pop() || ""; + const looksLikeList = !/^[0-9a-f-]{8,}$/.test(lastSeg); + await route.fulfill({ + status: 200, + contentType: "application/json", + body: looksLikeList ? "[]" : "{}", + }); + }); + const consoleErrors: string[] = []; page.on("console", (msg) => { if (msg.type() === "error") { @@ -70,13 +146,38 @@ test.describe("staging canvas tabs", () => { } }); - await page.goto(tenantURL, { waitUntil: "networkidle" }); + // Capture the URL of any failed network request so a "Failed to load + // resource: 404" console message we filter out below leaves a + // breadcrumb. Browser console messages for resource-load failures + // omit the URL, so we'd otherwise be flying blind. Logged to the + // test's stdout (visible in the workflow log under the failed step). + page.on("requestfailed", (req) => { + console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`); + }); + page.on("response", (res) => { + if (res.status() >= 400) { + console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`); + } + }); + + // waitUntil="networkidle" is wrong here — the canvas keeps a + // WebSocket open + polls /events and /workspaces every few + // seconds, so the network is *never* idle for 500ms. page.goto + // would hang until its 45s default timeout. 
"domcontentloaded" + // returns as soon as the HTML is parsed; React hydration + the + // selector wait below is what actually gates ready-for-interaction. + await page.goto(tenantURL, { waitUntil: "domcontentloaded" }); // Canvas hydration races WebSocket connect + /workspaces fetch. - // Wait for the tablist element (appears after a workspace is - // selected) or the hydration-error banner — whichever wins first. + // Wait for the React Flow canvas wrapper (always present once + // hydrated, even with zero workspaces) or the hydration-error + // banner — whichever wins first. Previous version of this wait + // used `[role="tablist"]`, but that selector only appears AFTER + // a workspace node is clicked (which happens below at L100), so + // the wait would always time out at 45s before any meaningful + // failure surfaced. await page.waitForSelector( - '[role="tablist"], [data-testid="hydration-error"]', + '[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]', { timeout: 45_000 }, ); @@ -106,6 +207,15 @@ test.describe("staging canvas tabs", () => { for (const tabId of TAB_IDS) { await test.step(`tab: ${tabId}`, async () => { const tabButton = page.locator(`#tab-${tabId}`); + // The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs + // wrapper) — tabs after position ~3 are clipped behind the + // right-edge fade gradient on smaller viewports. Playwright's + // `toBeVisible()` returns false for clipped elements, so a + // bare visibility check fails on `skills` and later tabs in + // CI. scrollIntoViewIfNeeded brings the button into view + // before the visibility check, mirroring what SidePanel's own + // keyboard handler does on arrow-key navigation. + await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 }); await expect( tabButton, `tab-${tabId} button missing — TABS list may have drifted`, @@ -134,14 +244,22 @@ test.describe("staging canvas tabs", () => { // Aggregate console-error budget. Known-noisy sources whitelisted: // Sentry, Vercel analytics, WS reconnects (expected on SaaS - // terminal), favicon 404 (cosmetic). + // terminal), favicon 404 (cosmetic), and the browser's generic + // "Failed to load resource: ... 404" message which never includes + // the URL — uninformative on its own and impossible to filter + // meaningfully without a URL. The page.on('requestfailed') + + // page.on('response>=400') logging above captures the actual URLs + // so a real bug still leaves a breadcrumb in the workflow log; + // a real exception (panel crash, JS error) surfaces as a typed + // error with file path which the filter still catches. const appErrors = consoleErrors.filter( (msg) => !msg.includes("sentry") && !msg.includes("vercel") && !msg.includes("WebSocket") && !msg.includes("favicon") && - !msg.includes("molecule-icon.png"), // another cosmetic 404 + !msg.includes("molecule-icon.png") && // cosmetic 404 + !msg.includes("Failed to load resource"), ); expect( appErrors, diff --git a/canvas/src/app/page.tsx b/canvas/src/app/page.tsx index 74291409..8b79ef83 100644 --- a/canvas/src/app/page.tsx +++ b/canvas/src/app/page.tsx @@ -61,6 +61,11 @@ export default function Home() { {hydrationError && (

{hydrationError}

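The staging-setup harness above drives its provisioning and TLS polls through two helpers, jsonFetch and waitFor, that this diff does not show. A minimal sketch of the contract the call sites assume (JSON-or-null body, poll a probe until truthy, throw after a deadline); this is an assumption about the helpers' shape, not their actual implementation.

// Assumed contract for the staging-setup helpers (not shown in this diff).
async function jsonFetch(
  url: string,
  init: RequestInit = {},
): Promise<{ status: number; body: any }> {
  const res = await fetch(url, init);
  let body: any = null;
  try {
    body = await res.json();
  } catch {
    // Non-JSON bodies (plain-text error pages, empty 204s) resolve to null.
  }
  return { status: res.status, body };
}

// Poll `probe` every `intervalMs` until it resolves truthy; throw after
// `timeoutMs`. An error thrown by the probe aborts immediately, which is
// how the provision poll fails fast on instance_status === "failed".
async function waitFor(
  probe: () => Promise<unknown>,
  timeoutMs: number,
  intervalMs: number,
  label: string,
): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (await probe()) return;
    await new Promise((r) => setTimeout(r, intervalMs));
  }
  throw new Error(`timed out after ${timeoutMs}ms waiting for: ${label}`);
}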
diff --git a/canvas/src/app/pricing/page.tsx b/canvas/src/app/pricing/page.tsx
index 061a7e60..a7327793 100644
--- a/canvas/src/app/pricing/page.tsx
+++ b/canvas/src/app/pricing/page.tsx
@@ -14,7 +14,7 @@ import { PricingTable } from "@/components/PricingTable";
 export const metadata = {
   title: "Pricing — Molecule AI",
   description:
-    "Free while you tinker, paid tiers for shipping production multi-agent organizations. Transparent usage-based overage pricing on Pro.",
+    "Flat-rate team and org pricing — no per-seat fees. Free to start, $29/month for teams, $99/month for production orgs. Full runtime stack included on every paid tier.",
 };
 
 export default function PricingPage() {
@@ -25,9 +25,12 @@
             Pricing

-            Free while you tinker. Pay when you ship real agents to production.
-            Every tier includes the full runtime stack — you upgrade for scale,
-            support, and dedicated infrastructure.
+            One flat price per org — not per seat. Every paid tier includes the
+            full runtime stack. You upgrade for scale, support, and dedicated
+            infrastructure.
+

+

+ 5-person team? You pay $29/month — not $200. No seat math, ever.

@@ -53,7 +56,8 @@ export default function PricingPage() { .

-          Prices shown in USD. Enterprise / self-hosted licensing available — contact us.
+          Prices shown in USD. Flat-rate per org — no per-seat fees on any paid tier.
+          Enterprise / self-hosted licensing available — contact us.

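The ProvisioningTimeout change that follows resolves per-runtime provisioning budgets through @/lib/runtimeProfiles, a module this diff references (and the new tests exercise) but does not include. A minimal sketch of the resolution order the call sites assume, override then runtime profile then default; the 2-minute floor and 12-minute hermes budget come from comments elsewhere in this diff, everything else here is an assumption.

// Hypothetical sketch of @/lib/runtimeProfiles; the real module is not part
// of this diff. Function names follow the call sites, values are illustrative.
export interface RuntimeProfile {
  provisionTimeoutMs: number;
}

export const DEFAULT_RUNTIME_PROFILE: RuntimeProfile = {
  provisionTimeoutMs: 120_000, // 2-minute floor for fast docker runtimes
};

// Only runtimes that need to deviate from the default get an entry.
export const RUNTIME_PROFILES: Record<string, Partial<RuntimeProfile>> = {
  hermes: { provisionTimeoutMs: 720_000 }, // 12-minute cold-boot budget
};

export function getRuntimeProfile(runtime?: string): RuntimeProfile {
  const profile = runtime ? RUNTIME_PROFILES[runtime] : undefined;
  return { ...DEFAULT_RUNTIME_PROFILE, ...profile };
}

// Resolution order: per-workspace server override, then runtime profile,
// then the default. Matches what the new unit tests assert.
export function provisionTimeoutForRuntime(
  runtime?: string,
  overrides?: Partial<RuntimeProfile>,
): number {
  return (
    overrides?.provisionTimeoutMs ??
    getRuntimeProfile(runtime).provisionTimeoutMs
  );
}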
diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx index c4ed460c..1c09fa3b 100644 --- a/canvas/src/components/ProvisioningTimeout.tsx +++ b/canvas/src/components/ProvisioningTimeout.tsx @@ -6,10 +6,16 @@ import { api } from "@/lib/api"; import { showToast } from "./Toaster"; import { ConsoleModal } from "./ConsoleModal"; -/** Base provisioning timeout in milliseconds (2 minutes). Used as the - * floor; the effective threshold scales with the number of workspaces - * concurrently provisioning (see effectiveTimeoutMs below). */ -export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000; +import { + DEFAULT_RUNTIME_PROFILE, + provisionTimeoutForRuntime, +} from "@/lib/runtimeProfiles"; + +/** Re-export for backward compatibility with tests and other importers + * that previously imported DEFAULT_PROVISION_TIMEOUT_MS from this file. + * New code should read via getRuntimeProfile() from @/lib/runtimeProfiles. */ +export const DEFAULT_PROVISION_TIMEOUT_MS = + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs; /** The server provisions up to `PROVISION_CONCURRENCY` containers at * once and paces the rest in a queue (`workspaceCreatePacingMs` = @@ -43,8 +49,12 @@ interface TimeoutEntry { * time per node. */ export function ProvisioningTimeout({ - timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS, + timeoutMs, }: { + // If undefined (the default when mounted without a prop), each workspace's + // threshold is resolved from its runtime via timeoutForRuntime(). + // Pass an explicit number to force a single threshold for every workspace + // (used by tests that want deterministic behavior regardless of runtime). timeoutMs?: number; }) { const [timedOut, setTimedOut] = useState([]); @@ -57,19 +67,28 @@ export function ProvisioningTimeout({ const [dismissed, setDismissed] = useState>(new Set()); // Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render - // (filter+map creates new array reference on every store update) + // (filter+map creates new array reference on every store update). + // Runtime included so the timeout threshold can be resolved per-node + // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker + // runtimes — a single threshold would false-alarm on one or the other). + // Separator: `|` between fields, `,` between nodes. Names may contain + // anything the user typed; strip `|` and `,` so serialization round-trips. const provisioningNodes = useCanvasStore((s) => { const result = s.nodes .filter((n) => n.data.status === "provisioning") - .map((n) => `${n.id}:${n.data.name}`); + .map((n) => { + const safeName = (n.data.name ?? "").replace(/[|,]/g, " "); + const runtime = n.data.runtime ?? ""; + return `${n.id}|${safeName}|${runtime}`; + }); return result.join(","); }); const parsedProvisioningNodes = useMemo( () => provisioningNodes ? provisioningNodes.split(",").map((entry) => { - const [id, name] = entry.split(":"); - return { id, name }; + const [id, name, runtime] = entry.split("|"); + return { id, name, runtime }; }) : [], [provisioningNodes], @@ -113,14 +132,21 @@ export function ProvisioningTimeout({ const interval = setInterval(() => { const now = Date.now(); const newTimedOut: TimeoutEntry[] = []; - const effective = effectiveTimeoutMs( - timeoutMs, - parsedProvisioningNodes.length, - ); + // Per-node timeout: each workspace resolves its own base via + // @/lib/runtimeProfiles (server-override → runtime profile → + // default), then scales by concurrent-provisioning count. 
A + // hermes workspace in a batch alongside two langgraph workspaces + // gets hermes's 12-min base, not langgraph's 2-min base. for (const node of parsedProvisioningNodes) { const startedAt = tracking.get(node.id); - if (startedAt && now - startedAt >= effective) { + if (!startedAt) continue; + const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime); + const effective = effectiveTimeoutMs( + base, + parsedProvisioningNodes.length, + ); + if (now - startedAt >= effective) { newTimedOut.push({ workspaceId: node.id, workspaceName: node.name, diff --git a/canvas/src/components/WorkspaceNode.tsx b/canvas/src/components/WorkspaceNode.tsx index 49c093e6..a2a8962f 100644 --- a/canvas/src/components/WorkspaceNode.tsx +++ b/canvas/src/components/WorkspaceNode.tsx @@ -322,31 +322,6 @@ function countDescendants(nodeId: string, allNodes: Node[], v * infinite recursion on circular parentId references and keeps the UI readable. */ const MAX_NESTING_DEPTH = 3; -/** Subscribes to allNodes only when children exist — isolates re-renders from parent */ -function EmbeddedTeam({ members, depth, onSelect, onExtract }: { - members: Node[]; - depth: number; - onSelect: (id: string) => void; - onExtract: (id: string) => void; -}) { - const allNodes = useCanvasStore((s) => s.nodes); - // Use grid layout at depth 0 when there are multiple members (departments side-by-side) - const useGrid = depth === 0 && members.length >= 2; - return ( -
-
Team Members
-
- {members.map((child) => ( - - ))} -
-
- ); -} - /** Recursive mini-card — mirrors parent card layout at smaller scale */ function TeamMemberChip({ node, diff --git a/canvas/src/components/__tests__/PricingTable.test.tsx b/canvas/src/components/__tests__/PricingTable.test.tsx index af5faec0..535daeb7 100644 --- a/canvas/src/components/__tests__/PricingTable.test.tsx +++ b/canvas/src/components/__tests__/PricingTable.test.tsx @@ -50,14 +50,14 @@ describe("PricingTable", () => { it("renders all three plans with their CTAs", () => { render(); expect(screen.getByRole("heading", { name: "Free" })).toBeTruthy(); - expect(screen.getByRole("heading", { name: "Starter" })).toBeTruthy(); - expect(screen.getByRole("heading", { name: "Pro" })).toBeTruthy(); + expect(screen.getByRole("heading", { name: "Team" })).toBeTruthy(); + expect(screen.getByRole("heading", { name: "Growth" })).toBeTruthy(); expect(screen.getByRole("button", { name: "Get started" })).toBeTruthy(); - expect(screen.getByRole("button", { name: "Upgrade to Starter" })).toBeTruthy(); - expect(screen.getByRole("button", { name: "Upgrade to Pro" })).toBeTruthy(); + expect(screen.getByRole("button", { name: "Upgrade to Team" })).toBeTruthy(); + expect(screen.getByRole("button", { name: "Upgrade to Growth" })).toBeTruthy(); }); - it("shows the 'Most popular' badge only on the starter card", () => { + it("shows the 'Most popular' badge only on the Team card", () => { render(); const badges = screen.getAllByText("Most popular"); expect(badges.length).toBe(1); @@ -74,7 +74,7 @@ describe("PricingTable", () => { it("Paid CTA + anonymous → bounces to signup (no checkout call)", async () => { mockedFetchSession.mockResolvedValue(null); render(); - fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" })); + fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" })); await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up")); expect(mockedStartCheckout).not.toHaveBeenCalled(); }); @@ -91,7 +91,7 @@ describe("PricingTable", () => { }); render(); - fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" })); + fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" })); await waitFor(() => expect(mockedStartCheckout).toHaveBeenCalledWith("pro", "acme"), @@ -111,7 +111,7 @@ describe("PricingTable", () => { mockedGetTenantSlug.mockReturnValue(""); render(); - fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" })); + fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" })); await waitFor(() => { const alert = screen.getByRole("alert"); @@ -129,7 +129,7 @@ describe("PricingTable", () => { mockedStartCheckout.mockRejectedValue(new Error("checkout: 500 boom")); render(); - fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" })); + fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" })); await waitFor(() => { const alert = screen.getByRole("alert"); @@ -140,7 +140,7 @@ describe("PricingTable", () => { it("treats fetchSession network errors as anonymous (fail-closed to signup)", async () => { mockedFetchSession.mockRejectedValue(new Error("network down")); render(); - fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" })); + fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" })); await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up")); expect(mockedStartCheckout).not.toHaveBeenCalled(); }); @@ -155,7 +155,7 @@ describe("PricingTable", () => { 
mockedStartCheckout.mockReturnValue(new Promise(() => {})); render(); - const button = screen.getByRole("button", { name: "Upgrade to Pro" }); + const button = screen.getByRole("button", { name: "Upgrade to Growth" }); fireEvent.click(button); await waitFor(() => { diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx index f1c5b150..2424ea49 100644 --- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx +++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx @@ -8,6 +8,12 @@ global.fetch = vi.fn(() => import { useCanvasStore } from "../../store/canvas"; import type { WorkspaceData } from "../../store/socket"; import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout"; +import { + DEFAULT_RUNTIME_PROFILE, + RUNTIME_PROFILES, + getRuntimeProfile, + provisionTimeoutForRuntime, +} from "@/lib/runtimeProfiles"; // Helper to build a WorkspaceData object function makeWS(overrides: Partial & { id: string }): WorkspaceData { @@ -184,4 +190,102 @@ describe("ProvisioningTimeout", () => { .nodes.filter((n) => n.data.status === "provisioning"); expect(stillProvisioning).toHaveLength(2); }); + + // ── Runtime-aware timeout regression tests (2026-04-24 outage) ──────────── + // Prior to this, a hermes workspace consistently false-alarmed at 2 min + // into its 8-13 min cold boot, pushing users to retry something that + // would have come online on its own. The runtime-aware override keeps + // the 2-min floor for fast docker runtimes while giving hermes its + // honest 12-min budget. + + describe("runtime profile resolution (@/lib/runtimeProfiles)", () => { + describe("provisionTimeoutForRuntime", () => { + it("returns the default for unknown/missing runtimes", () => { + expect(provisionTimeoutForRuntime(undefined)).toBe( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs, + ); + expect(provisionTimeoutForRuntime("")).toBe( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs, + ); + expect(provisionTimeoutForRuntime("some-future-runtime")).toBe( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs, + ); + }); + + it("returns default for known-fast runtimes (not in profile map)", () => { + // If someone ever adds one of these to RUNTIME_PROFILES with a + // slower value, this test catches the unintended regression. + expect(provisionTimeoutForRuntime("claude-code")).toBe( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs, + ); + expect(provisionTimeoutForRuntime("langgraph")).toBe( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs, + ); + expect(provisionTimeoutForRuntime("crewai")).toBe( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs, + ); + }); + + it("returns hermes override when runtime = hermes", () => { + expect(provisionTimeoutForRuntime("hermes")).toBe( + RUNTIME_PROFILES.hermes?.provisionTimeoutMs, + ); + expect(provisionTimeoutForRuntime("hermes")).toBeGreaterThanOrEqual( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs * 5, + ); + }); + + it("server-side workspace override wins over runtime profile", () => { + // The resolution order is: overrides → profile → default. + // An operator-tunable per-workspace number on the backend + // (e.g. via a template manifest field) should beat the canvas + // runtime map. 
+ expect( + provisionTimeoutForRuntime("hermes", { + provisionTimeoutMs: 60_000, + }), + ).toBe(60_000); + expect( + provisionTimeoutForRuntime("some-unknown", { + provisionTimeoutMs: 300_000, + }), + ).toBe(300_000); + }); + }); + + describe("getRuntimeProfile", () => { + it("returns a structural profile with required fields", () => { + const profile = getRuntimeProfile("hermes"); + expect(profile.provisionTimeoutMs).toBeTypeOf("number"); + expect(profile.provisionTimeoutMs).toBeGreaterThan(0); + }); + + it("default profile is a valid superset of every override", () => { + // Every entry in RUNTIME_PROFILES must provide fields the + // default does — otherwise consumers could get undefined where + // they expected a number. This test enforces that contract so + // future entries can't accidentally drop fields. + for (const [runtime, profile] of Object.entries(RUNTIME_PROFILES)) { + const resolved = getRuntimeProfile(runtime); + expect( + resolved.provisionTimeoutMs, + `runtime=${runtime} must resolve to a number`, + ).toBeTypeOf("number"); + expect(resolved.provisionTimeoutMs).toBeGreaterThan(0); + // Profile's explicit value should be used iff present. + if (profile.provisionTimeoutMs !== undefined) { + expect(resolved.provisionTimeoutMs).toBe(profile.provisionTimeoutMs); + } + } + }); + }); + + describe("DEFAULT_PROVISION_TIMEOUT_MS backward-compat export", () => { + it("still exports the same default for legacy importers", () => { + expect(DEFAULT_PROVISION_TIMEOUT_MS).toBe( + DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs, + ); + }); + }); + }); }); diff --git a/canvas/src/components/__tests__/tabs.a11y.test.tsx b/canvas/src/components/__tests__/tabs.a11y.test.tsx index 91f2c370..a7000917 100644 --- a/canvas/src/components/__tests__/tabs.a11y.test.tsx +++ b/canvas/src/components/__tests__/tabs.a11y.test.tsx @@ -183,7 +183,31 @@ describe("ChannelsTab — htmlFor/id label associations (WCAG 1.3.1)", () => { beforeEach(() => { mockApiGet.mockImplementation((url: string) => { if (url.includes("/channels/adapters")) { - return Promise.resolve([{ type: "telegram", display_name: "Telegram" }]); + // Mirror the real GET /channels/adapters shape — schema-driven form + // relies on config_schema arriving from the adapter. A bare + // {type, display_name} mock renders an empty form and every + // getByLabelText below fails. + return Promise.resolve([ + { + type: "telegram", + display_name: "Telegram", + config_schema: [ + { + key: "bot_token", + label: "Bot Token", + type: "password", + required: true, + sensitive: true, + }, + { + key: "chat_id", + label: "Chat IDs", + type: "text", + required: true, + }, + ], + }, + ]); } return Promise.resolve([]); }); diff --git a/canvas/src/components/settings/UnsavedChangesGuard.tsx b/canvas/src/components/settings/UnsavedChangesGuard.tsx index 373716a3..d9b198d1 100644 --- a/canvas/src/components/settings/UnsavedChangesGuard.tsx +++ b/canvas/src/components/settings/UnsavedChangesGuard.tsx @@ -31,12 +31,12 @@ export function UnsavedChangesGuard({
- - diff --git a/canvas/src/components/tabs/ActivityTab.tsx b/canvas/src/components/tabs/ActivityTab.tsx index 74f0d781..fc857842 100644 --- a/canvas/src/components/tabs/ActivityTab.tsx +++ b/canvas/src/components/tabs/ActivityTab.tsx @@ -186,7 +186,7 @@ function ActivityRow({ : "bg-zinc-800/60 border-zinc-700/40" }`} > - + + {/* Render one input per schema field. Fallback path: if the + backend didn't return a schema (older platform version) show + a single bot_token + chat_id pair to preserve the old UX. */} + {currentSchema.length === 0 ? ( +
+ Platform exposes no config schema — upgrade the platform to pick up first-class support.
- {discoveredChats.length > 0 && ( -
- {discoveredChats.map((chat) => ( - - ))} -
- )} - {(discoveredChats.length === 0 || showManualInput) && ( - setFormChatId(e.target.value)} - placeholder="-100123456789, -100987654321" - className="w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600" + ) : ( + currentSchema.map((field) => ( + setFieldValue(field.key, v)} + // Detect Chats button lives next to the chat_id input on + // Telegram only (the only platform with getUpdates). + renderExtras={ + field.key === "chat_id" && SUPPORTS_DETECT_CHATS.has(formType) + ? () => ( + <> +
+ +
+ {discoveredChats.length > 0 && ( +
+ {discoveredChats.map((chat) => ( + + ))} + +
+ )} + + ) + : undefined + } /> - )} -

- {discoveredChats.length > 0 ? ( - <> - Chats: {formChatId || "(none selected)"} - {" · "} - - - ) : ( - "Click Detect Chats after adding the bot to groups or sending /start in DMs." - )} -

-
+ )) + )} +
{formError && ( @@ -343,7 +378,7 @@ export function ChannelsTab({ workspaceId }: Props) {

No channels connected

-            Connect Telegram, Slack, or Discord to chat with this agent from social platforms.
+            Connect Telegram, Slack, Discord, or Lark / Feishu to chat with this agent from social platforms.

          )}
@@ -364,7 +399,7 @@
                  {ch.channel_type.charAt(0).toUpperCase() + ch.channel_type.slice(1)}
-                {ch.config.chat_id}
+                {ch.config.chat_id || ch.config.channel_id || ""}
@@ -415,3 +450,53 @@ export function ChannelsTab({ workspaceId }: Props) {
); } + +// SchemaField renders one ConfigField as a label + input. Kept inline in +// this file so the ChannelsTab stays self-contained; promote to its own +// module if another tab ever needs it. +function SchemaField({ + field, + value, + onChange, + renderExtras, +}: { + field: ConfigField; + value: string; + onChange: (v: string) => void; + renderExtras?: () => React.ReactNode; +}) { + const inputId = useId(); + const common = + "w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600"; + return ( +
+ + {field.type === "textarea" ? ( +
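The new SchemaField component above types its field prop as ConfigField, and the updated a11y test mocks config_schema entries on the adapter payload, but the type itself is not visible in this diff. A plausible shape, inferred from the mocked /channels/adapters response and the render branches (password, text, textarea); treat it as an assumption rather than the shipped definition.

// Inferred shapes; an assumption, not the actual types in the codebase.
export type ConfigFieldType = "text" | "password" | "textarea";

export interface ConfigField {
  key: string;            // config map key, e.g. "bot_token", "chat_id"
  label: string;          // human-readable label rendered next to the input
  type: ConfigFieldType;  // picks which input variant SchemaField renders
  required?: boolean;
  sensitive?: boolean;    // mask in the UI (assumed semantics)
  placeholder?: string;   // optional hint text (assumed field)
}

export interface ChannelAdapter {
  type: string;                  // "telegram", "slack", "discord", "lark", ...
  display_name: string;
  config_schema?: ConfigField[]; // absent on older platform versions
}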