Merge branch 'staging' into feat/external-runtime-first-class

Hongming Wang 2026-04-26 02:22:38 -07:00 committed by GitHub
commit 775406d7fe
57 changed files with 2257 additions and 342 deletions

View File

@ -43,6 +43,17 @@ jobs:
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
# Without an LLM key the test_staging_full_saas.sh script provisions
# the workspace with empty secrets, hermes derive-provider.sh resolves
# `openai/gpt-4o` to PROVIDER=openrouter, no OPENROUTER_API_KEY is
# found in env, and A2A returns "No LLM provider configured" at
# request time (canary step 8/11). The full-lifecycle workflow
# (e2e-staging-saas.yml) has carried this secret since launch — the
# canary regressed when it was first split out and lost the env
# block. Issue #1500 had ~30 consecutive failures before this was
# spotted; do NOT remove without re-reading the script's secrets-
# injection block.
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
E2E_MODE: canary
E2E_RUNTIME: hermes
E2E_RUN_ID: "canary-${{ github.run_id }}"
@ -57,6 +68,14 @@ jobs:
exit 2
fi
- name: Verify OpenAI key present
run: |
if [ -z "$E2E_OPENAI_API_KEY" ]; then
echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'"
exit 2
fi
echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})"
- name: Canary run
id: canary
run: bash tests/e2e/test_staging_full_saas.sh
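For reference, the env block above can be mirrored locally to reproduce the canary by hand. A sketch with placeholder secrets (the real values live in the repo's CI secrets; any unique E2E_RUN_ID works):

# Hypothetical local run mirroring the canary job's environment.
export MOLECULE_CP_URL=https://staging-api.moleculesai.app
export MOLECULE_ADMIN_TOKEN="<staging admin token>"   # MOLECULE_STAGING_ADMIN_TOKEN in CI
export E2E_OPENAI_API_KEY="<openai key>"              # MOLECULE_STAGING_OPENAI_KEY in CI
export E2E_MODE=canary E2E_RUNTIME=hermes E2E_RUN_ID="canary-local-$(date +%s)"
bash tests/e2e/test_staging_full_saas.sh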

View File

@ -0,0 +1,164 @@
name: redeploy-tenants-on-main
# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
# 1. publish-workspace-server-image completes → new :latest in GHCR.
# 2. This workflow fires via workflow_run, waits 30s for GHCR's
# CDN to propagate the new tag to the region the tenants pull from.
# 3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s
# soak. Canary proves the image boots; batches follow.
# 4. Any failure aborts the rollout and leaves older tenants on the
# prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.
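A manual rollback dispatch could look like the following sketch (assumes an authenticated gh CLI against Molecule-AI/molecule-core; the SHA is a placeholder):

# Hypothetical pinned re-run: redeploy every tenant onto an older image tag.
gh workflow run redeploy-tenants-on-main \
  --repo Molecule-AI/molecule-core \
  -f target_tag=<older-sha> \
  -f canary_slug=hongmingwang \
  -f soak_seconds=60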
on:
workflow_run:
workflows: ['publish-workspace-server-image']
types: [completed]
branches: [main]
workflow_dispatch:
inputs:
target_tag:
description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.'
required: false
type: string
default: 'latest'
canary_slug:
description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
required: false
type: string
default: 'hongmingwang'
soak_seconds:
description: 'Seconds to wait after canary before fanning out.'
required: false
type: string
default: '60'
batch_size:
description: 'How many tenants SSM redeploys in parallel per batch.'
required: false
type: string
default: '3'
dry_run:
description: 'Plan only — do not actually redeploy.'
required: false
type: boolean
default: false
permissions:
contents: read
# No write scopes needed — the workflow hits an external CP endpoint,
# not the GitHub API.
jobs:
redeploy:
# Skip the auto-trigger if publish-workspace-server-image didn't
# actually succeed. workflow_run fires on any completion state; we
# don't want to redeploy against a half-built image.
if: |
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
runs-on: ubuntu-latest
timeout-minutes: 25
steps:
- name: Wait for GHCR tag propagation
# GHCR's edge cache takes ~15-30s to consistently serve the new
# :latest manifest after the registry accepts the push. Without
# this sleep, the first tenant's docker pull sometimes races
# and fetches the previous digest; sleeping is the cheapest
# way to reduce that without polling GHCR for the new digest.
run: sleep 30
- name: Call CP redeploy-fleet
# CP_ADMIN_API_TOKEN must be set as a repo/org secret on
# Molecule-AI/molecule-core, matching the staging/prod CP's
# CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
# repo's secrets for CI.
env:
CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }}
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
DRY_RUN: ${{ inputs.dry_run || false }}
run: |
set -euo pipefail
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
exit 1
fi
BODY=$(jq -nc \
--arg tag "$TARGET_TAG" \
--arg canary "$CANARY_SLUG" \
--argjson soak "$SOAK_SECONDS" \
--argjson batch "$BATCH_SIZE" \
--argjson dry "$DRY_RUN" \
'{
target_tag: $tag,
canary_slug: $canary,
soak_seconds: $soak,
batch_size: $batch,
dry_run: $dry
}')
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
echo " body: $BODY"
HTTP_RESPONSE=$(mktemp)
HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-m 1200 \
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-H "Content-Type: application/json" \
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-d "$BODY" || echo "000")
echo "HTTP $HTTP_CODE"
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
# Pretty-print per-tenant results in the job summary so
# ops can see which tenants were redeployed without drilling
# into the raw response.
{
echo "## Tenant redeploy fleet"
echo ""
echo "**Target tag:** \`$TARGET_TAG\`"
echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
echo "**Batch size:** $BATCH_SIZE"
echo "**Dry run:** $DRY_RUN"
echo "**HTTP:** $HTTP_CODE"
echo ""
echo "### Per-tenant result"
echo ""
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
echo '|------|-------|------------|------|---------|-------|'
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
} >> "$GITHUB_STEP_SUMMARY"
if [ "$HTTP_CODE" != "200" ]; then
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
exit 1
fi
OK=$(jq -r '.ok' "$HTTP_RESPONSE")
if [ "$OK" != "true" ]; then
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
exit 1
fi
echo "::notice::Tenant fleet redeploy complete."

View File

@ -33,18 +33,49 @@ jobs:
|| github.event.pull_request.user.login == 'molecule-ai[bot]'
steps:
- name: Retarget PR base to staging
id: retarget
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
# Issue #1884: when the bot opens a PR against main and there's
# already another PR on the same head branch targeting staging,
# GitHub's PATCH /pulls returns 422 with
# "A pull request already exists for base branch 'staging' …".
# The retarget can't proceed — but the right response is to
# close the now-redundant main-PR, not to fail the workflow
# noisily. Detect that specific 422 and close instead.
run: |
set +e
echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging"
gh api -X PATCH \
PATCH_OUTPUT=$(gh api -X PATCH \
"repos/${{ github.repository }}/pulls/${PR_NUMBER}" \
-f base=staging \
--jq '.base.ref'
--jq '.base.ref' 2>&1)
PATCH_EXIT=$?
set -e
if [ "$PATCH_EXIT" -eq 0 ]; then
echo "::notice::Retargeted PR #${PR_NUMBER} → staging"
echo "outcome=retargeted" >> "$GITHUB_OUTPUT"
exit 0
fi
# Specifically match the 422 duplicate-base/head error so
# any OTHER PATCH failure (auth, deleted PR, etc.) still
# surfaces as a real workflow failure.
if echo "$PATCH_OUTPUT" | grep -q "pull request already exists for base branch 'staging'"; then
echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant."
gh pr close "$PR_NUMBER" \
--repo "${{ github.repository }}" \
--comment "[retarget-bot] Closing — another PR on the same head branch already targets \`staging\`. This PR is redundant. See issue #1884 for the rationale."
echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT"
exit 0
fi
echo "::error::Retarget PATCH failed and was NOT a duplicate-base error:"
echo "$PATCH_OUTPUT" >&2
exit 1
- name: Post explainer comment
if: steps.retarget.outputs.outcome == 'retargeted'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}

View File

@ -0,0 +1,170 @@
name: Sweep stale e2e-* orgs (staging)
# Janitor for staging tenants left behind when E2E cleanup didn't run:
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
# bash trap missed (signal 9), etc. Without this loop, every failed
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
#
# Why not rely on per-test-run teardown:
# - Per-run teardown is best-effort by definition. Any process death
# after the test starts but before the trap fires leaves debris.
# - GH Actions cancellation kills the runner without grace period.
# The workflow's `if: always()` step usually catches this, but it
# too can fail (CP transient 5xx, runner network issue at the
# wrong moment).
# - Even when teardown runs, the CP cascade is best-effort in places
# (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
# - This sweep is the catch-all that converges staging back to clean
# regardless of which specific path leaked.
#
# The PROPER fix is making CP cleanup transactional + verify-after-
# terminate (filed separately as cleanup-correctness work). This
# workflow is the safety net that catches everything else AND any
# future leak source we haven't yet identified.
on:
schedule:
# Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
# clock from create to teardown). Anything older than the
# MAX_AGE_MINUTES threshold below is presumed dead.
- cron: '0 * * * *'
workflow_dispatch:
inputs:
max_age_minutes:
description: "Delete e2e-* orgs older than N minutes (default 120)"
required: false
default: "120"
dry_run:
description: "Dry run only — list what would be deleted"
required: false
type: boolean
default: false
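A manual, non-destructive sweep can be dispatched with these inputs. A sketch, assuming the gh CLI and this workflow's display name:

# Hypothetical dry-run dispatch: list stale e2e-* orgs without deleting them.
gh workflow run "Sweep stale e2e-* orgs (staging)" \
  -f max_age_minutes=60 \
  -f dry_run=true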
# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
# on a manual trigger; queue rather than parallel-delete.
concurrency:
group: sweep-stale-e2e-orgs
cancel-in-progress: false
permissions:
contents: read
jobs:
sweep:
name: Sweep e2e orgs
runs-on: ubuntu-latest
timeout-minutes: 15
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
# Refuse to delete more than this many orgs in one tick. If the
# CP DB is briefly empty (or the admin endpoint goes weird and
# returns no created_at), every e2e- org would look stale.
# Bailing protects against runaway nukes.
SAFETY_CAP: 50
steps:
- name: Verify admin token present
run: |
if [ -z "$ADMIN_TOKEN" ]; then
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
exit 2
fi
echo "Admin token present ✓"
- name: Identify stale e2e orgs
id: identify
run: |
set -euo pipefail
# Fetch into a file so the python snippet below reads it from disk —
# cleaner than embedding $(curl ...) inside the heredoc.
curl -sS --fail-with-body --max-time 30 \
"$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
> orgs.json
# Filter:
# 1. slug starts with 'e2e-' (covers e2e-, e2e-canary-,
# e2e-canvas-* — all variants the test scripts mint)
# 2. created_at is older than MAX_AGE_MINUTES ago
# Output one slug per line to a file the next step reads.
python3 > stale_slugs.txt <<'PY'
import json, os
from datetime import datetime, timezone, timedelta
with open("orgs.json") as f:
data = json.load(f)
max_age = int(os.environ["MAX_AGE_MINUTES"])
cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
for o in data.get("orgs", []):
slug = o.get("slug", "")
if not slug.startswith("e2e-"):
continue
created = o.get("created_at")
if not created:
# Defensively skip rows without created_at — better
# to leave one orphan than nuke a brand-new row
# whose timestamp didn't render.
continue
# Python 3.11+ handles RFC3339 with Z directly via
# fromisoformat; older runners need the trailing Z swap.
created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
if created_dt < cutoff:
print(slug)
PY
count=$(wc -l < stale_slugs.txt | tr -d ' ')
echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
if [ "$count" -gt 0 ]; then
echo "First 20:"
head -20 stale_slugs.txt | sed 's/^/ /'
fi
echo "count=$count" >> "$GITHUB_OUTPUT"
- name: Safety gate
if: steps.identify.outputs.count != '0'
run: |
count="${{ steps.identify.outputs.count }}"
if [ "$count" -gt "$SAFETY_CAP" ]; then
echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
exit 1
fi
echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
- name: Delete stale orgs
if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
run: |
set -uo pipefail
deleted=0
failed=0
while IFS= read -r slug; do
[ -z "$slug" ] && continue
# The DELETE handler requires {"confirm": "<slug>"} matching
# the URL slug — fat-finger guard. Idempotent: re-issuing
# picks up via org_purges.last_step.
http_code=$(curl -sS -o /tmp/del_resp -w "%{http_code}" \
--max-time 60 \
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$slug\"}" || echo "000")
if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
deleted=$((deleted+1))
echo " deleted: $slug"
else
failed=$((failed+1))
echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
fi
done < stale_slugs.txt
echo ""
echo "Sweep summary: deleted=$deleted failed=$failed"
# Don't fail the workflow on per-org delete errors — the
# sweeper is best-effort. Next hourly tick re-attempts. We
# only fail loud at the safety-cap gate above.
- name: Dry-run summary
if: env.DRY_RUN == 'true'
run: |
echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s). Re-run with dry_run=false to actually delete."

View File

@ -1,4 +1,4 @@
FROM node:20-alpine AS builder
FROM node:22-alpine AS builder
WORKDIR /app
COPY package.json package-lock.json* ./
RUN npm install
@ -11,7 +11,7 @@ ENV NEXT_PUBLIC_WS_URL=$NEXT_PUBLIC_WS_URL
ENV NEXT_PUBLIC_ADMIN_TOKEN=$NEXT_PUBLIC_ADMIN_TOKEN
RUN npm run build
FROM node:20-alpine
FROM node:22-alpine
WORKDIR /app
COPY --from=builder /app/.next/standalone ./
COPY --from=builder /app/.next/static ./.next/static

View File

@ -5,7 +5,7 @@
* the per-tenant admin token, provisions one hermes workspace, waits
* for online, then exports:
*
* STAGING_TENANT_URL https://<slug>.moleculesai.app
* STAGING_TENANT_URL https://<slug>.staging.moleculesai.app
* STAGING_WORKSPACE_ID UUID of the hermes workspace
* STAGING_TENANT_TOKEN per-tenant admin bearer (for spec requests)
* STAGING_SLUG org slug (used by teardown)
@ -16,6 +16,11 @@
* CP_ADMIN_API_TOKEN). Drives provision +
* tenant-token retrieval + teardown via a
* single credential.
* STAGING_TENANT_DOMAIN default: staging.moleculesai.app the
* DNS suffix the CP provisioner writes for
* staging tenants. Override only when
* running this harness against a non-default
* zone.
*/
import type { FullConfig } from "@playwright/test";
@ -25,6 +30,14 @@ import { join } from "path";
const CP_URL = process.env.MOLECULE_CP_URL || "https://staging-api.moleculesai.app";
const ADMIN_TOKEN = process.env.MOLECULE_ADMIN_TOKEN;
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
// Tenant DNS zone for staging. CP provisioner registers DNS as
// `<slug>.staging.moleculesai.app` (see internal/provisioner/ec2.go's
// EC2 provisioner: DNS log line). The previous default of plain
// `moleculesai.app` matched prod tenant naming and silently broke
// every staging E2E at the TLS readiness step — DNS literally didn't
// resolve, fetch threw NXDOMAIN, waitFor saw null on every poll, and
// the harness wedged at TLS_TIMEOUT_MS instead of failing loud.
const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.app";
// Tenant cold boot on staging regularly takes 12-15 min when the
// workspace-server Docker image isn't already cached on the AMI. Raised
@ -105,22 +118,44 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
}
console.log(`[staging-setup] Org created: ${slug}`);
// 2. Wait for tenant running (admin-orgs list is the status source)
// 2. Wait for tenant running (admin-orgs list is the status source).
//
// The CP /cp/admin/orgs endpoint returns each org with an
// `instance_status` field (handlers/admin.go:adminOrgSummary,
// sourced from `org_instances.status`). NOT `status` — there's no
// top-level `status` on the row at all. A previous version of this
// test polled `row.status`, which was always undefined, so this
// waitFor never resolved truthy and the harness invariably timed
// out at 1200s — masking real CP bugs (see #242 chain) AND
// surviving real CP fixes alike.
// Capture the org UUID alongside the running check — every request
// we send to the tenant URL after this point needs an
// X-Molecule-Org-Id header (see workspace-server middleware/tenant_guard.go).
// Without it, TenantGuard returns 404 ("must not be inferable by
// probing other orgs' machines"). The CP returns the id on the
// admin-orgs row; capture it here while we're already polling.
let orgID = "";
await waitFor<boolean>(
async () => {
const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
if (r.status !== 200) return null;
const row = (r.body?.orgs || []).find((o: any) => o.slug === slug);
if (!row) return null;
if (row.status === "running") return true;
if (row.status === "failed") throw new Error(`provision failed: ${slug}`);
if (row.instance_status === "running") {
orgID = row.id;
return true;
}
if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`);
return null;
},
PROVISION_TIMEOUT_MS,
15_000,
"tenant provision",
);
console.log(`[staging-setup] Tenant running`);
if (!orgID) {
throw new Error(`expected admin-orgs row to carry id, got empty for slug=${slug}`);
}
console.log(`[staging-setup] Tenant running (org_id=${orgID})`);
// 3. Fetch per-tenant admin token
const tokRes = await jsonFetch(
@ -133,7 +168,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
);
}
const tenantToken: string = tokRes.body.admin_token;
const tenantURL = `https://${slug}.moleculesai.app`;
const tenantURL = `https://${slug}.${TENANT_DOMAIN}`;
console.log(`[staging-setup] Tenant URL: ${tenantURL}`);
// 4. TLS readiness
@ -154,7 +189,17 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
);
// 5. Provision workspace
const tenantAuth = { Authorization: `Bearer ${tenantToken}` };
//
// tenantAuth carries TWO headers, both required:
// - Authorization: Bearer <admin-token> — wsAdmin middleware gate
// - X-Molecule-Org-Id: <uuid> — TenantGuard cross-org gate
// Missing the org-id header silently 404s every non-allowlisted
// route, with no body and no security headers. The 404 is intentional
// (existence-non-inference) which makes it look like a missing route.
const tenantAuth = {
"Authorization": `Bearer ${tenantToken}`,
"X-Molecule-Org-Id": orgID,
};
const ws = await jsonFetch(`${tenantURL}/workspaces`, {
method: "POST",
headers: tenantAuth,

View File

@ -63,6 +63,82 @@ test.describe("staging canvas tabs", () => {
Authorization: `Bearer ${tenantToken}`,
});
// canvas/src/components/AuthGate.tsx fetches /cp/auth/me on mount
// and redirects to the login page on 401. The bearer header above
// is for platform API calls — it does NOT satisfy /cp/auth/me,
// which is cookie-based (WorkOS session). Without this mock, the
// canvas page mounts AuthGate, sees 401 from /cp/auth/me, and
// redirects away from the tenant URL before the React Flow root
// ever renders. The [aria-label] selector wait then times out.
//
// Intercept /cp/auth/me + return a fake Session shape so AuthGate
// resolves to "authenticated" and renders {children}. The session
// contents are cosmetic — the canvas only inspects org_id/user_id
// in a few places that don't fail when these are dummy values.
await context.route("**/cp/auth/me", (route) =>
route.fulfill({
status: 200,
contentType: "application/json",
body: JSON.stringify({
user_id: `e2e-test-user-${workspaceId}`,
org_id: "e2e-test-org",
email: "e2e@test.local",
}),
}),
);
// Universal 401 → empty-200 fallback (defense-in-depth).
//
// The original product bug was canvas/src/lib/api.ts:62-74 calling
// `redirectToLogin` on EVERY 401 — a single workspace-scoped 401
// (e.g. /workspaces/:id/peers, /plugins) yanked the user (and the
// test) to AuthKit. That's now fixed at the source: api.ts probes
// /cp/auth/me before redirecting, so a 401 from a non-auth path
// with a live session throws a regular error instead.
//
// This route handler stays as a SAFETY NET, not the primary
// defense:
// 1. It silences resource-load console noise from the browser
(those messages don't include the URL — useless for
diagnostics; the assertion-block filter already drops them,
but keeping 401s off the network entirely is cleaner).
// 2. It guards against panels that DON'T have try/catch around
// their api calls — an unhandled rejection would surface
// as console.error → fail the assertion. Panels SHOULD
// handle errors, but until they're all audited, this is
// the test's belt to api.ts's braces.
//
// Pass-through real responses; swap 401s for 200 + empty body.
// Skip /cp/auth/me (mocked above) and non-fetch resources
// (HTML/JS/CSS bundles that should NOT be intercepted).
await context.route("**", async (route, request) => {
if (request.resourceType() !== "fetch") {
return route.fallback();
}
// /cp/auth/me is mocked above with a fixed Session shape — let
// that handler win without us round-tripping the network.
if (request.url().includes("/cp/auth/me")) {
return route.fallback();
}
let resp;
try {
resp = await route.fetch();
} catch {
return route.fallback();
}
if (resp.status() !== 401) {
return route.fulfill({ response: resp });
}
const lastSeg =
new URL(request.url()).pathname.split("/").filter(Boolean).pop() || "";
const looksLikeList = !/^[0-9a-f-]{8,}$/.test(lastSeg);
await route.fulfill({
status: 200,
contentType: "application/json",
body: looksLikeList ? "[]" : "{}",
});
});
const consoleErrors: string[] = [];
page.on("console", (msg) => {
if (msg.type() === "error") {
@ -70,13 +146,38 @@ test.describe("staging canvas tabs", () => {
}
});
await page.goto(tenantURL, { waitUntil: "networkidle" });
// Capture the URL of any failed network request so a "Failed to load
// resource: 404" console message we filter out below leaves a
// breadcrumb. Browser console messages for resource-load failures
// omit the URL, so we'd otherwise be flying blind. Logged to the
// test's stdout (visible in the workflow log under the failed step).
page.on("requestfailed", (req) => {
console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
});
page.on("response", (res) => {
if (res.status() >= 400) {
console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
}
});
// waitUntil="networkidle" is wrong here — the canvas keeps a
// WebSocket open + polls /events and /workspaces every few
// seconds, so the network is *never* idle for 500ms. page.goto
// would hang until its 45s default timeout. "domcontentloaded"
// returns as soon as the HTML is parsed; React hydration + the
// selector wait below is what actually gates ready-for-interaction.
await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
// Canvas hydration races WebSocket connect + /workspaces fetch.
// Wait for the tablist element (appears after a workspace is
// selected) or the hydration-error banner — whichever wins first.
// Wait for the React Flow canvas wrapper (always present once
// hydrated, even with zero workspaces) or the hydration-error
// banner — whichever wins first. Previous version of this wait
// used `[role="tablist"]`, but that selector only appears AFTER
// a workspace node is clicked (which happens below at L100), so
// the wait would always time out at 45s before any meaningful
// failure surfaced.
await page.waitForSelector(
'[role="tablist"], [data-testid="hydration-error"]',
'[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
{ timeout: 45_000 },
);
@ -106,6 +207,15 @@ test.describe("staging canvas tabs", () => {
for (const tabId of TAB_IDS) {
await test.step(`tab: ${tabId}`, async () => {
const tabButton = page.locator(`#tab-${tabId}`);
// The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
// wrapper) — tabs after position ~3 are clipped behind the
// right-edge fade gradient on smaller viewports. Playwright's
// `toBeVisible()` returns false for clipped elements, so a
// bare visibility check fails on `skills` and later tabs in
// CI. scrollIntoViewIfNeeded brings the button into view
// before the visibility check, mirroring what SidePanel's own
// keyboard handler does on arrow-key navigation.
await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
await expect(
tabButton,
`tab-${tabId} button missing — TABS list may have drifted`,
@ -134,14 +244,22 @@ test.describe("staging canvas tabs", () => {
// Aggregate console-error budget. Known-noisy sources whitelisted:
// Sentry, Vercel analytics, WS reconnects (expected on SaaS
// terminal), favicon 404 (cosmetic).
// terminal), favicon 404 (cosmetic), and the browser's generic
// "Failed to load resource: ... 404" message which never includes
// the URL — uninformative on its own and impossible to filter
// meaningfully without a URL. The page.on('requestfailed') +
// page.on('response>=400') logging above captures the actual URLs
// so a real bug still leaves a breadcrumb in the workflow log;
// a real exception (panel crash, JS error) surfaces as a typed
// error with file path which the filter still catches.
const appErrors = consoleErrors.filter(
(msg) =>
!msg.includes("sentry") &&
!msg.includes("vercel") &&
!msg.includes("WebSocket") &&
!msg.includes("favicon") &&
!msg.includes("molecule-icon.png"), // another cosmetic 404
!msg.includes("molecule-icon.png") && // cosmetic 404
!msg.includes("Failed to load resource"),
);
expect(
appErrors,

View File

@ -61,6 +61,11 @@ export default function Home() {
{hydrationError && (
<div
role="alert"
// Stable testid so the staging E2E (canvas/e2e/staging-tabs.spec.ts)
// can detect this banner without depending on the role="alert"
// selector that's used by other transient toasts. Don't rename
// without updating that spec.
data-testid="hydration-error"
className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-4 z-[9999]"
>
<p className="text-zinc-400 text-sm">{hydrationError}</p>

View File

@ -14,7 +14,7 @@ import { PricingTable } from "@/components/PricingTable";
export const metadata = {
title: "Pricing — Molecule AI",
description:
"Free while you tinker, paid tiers for shipping production multi-agent organizations. Transparent usage-based overage pricing on Pro.",
"Flat-rate team and org pricing — no per-seat fees. Free to start, $29/month for teams, $99/month for production orgs. Full runtime stack included on every paid tier.",
};
export default function PricingPage() {
@ -25,9 +25,12 @@ export default function PricingPage() {
Pricing
</h1>
<p className="mx-auto mt-4 max-w-2xl text-lg text-zinc-300">
Free while you tinker. Pay when you ship real agents to production.
Every tier includes the full runtime stack you upgrade for scale,
support, and dedicated infrastructure.
One flat price per org, not per seat. Every paid tier includes the
full runtime stack. You upgrade for scale, support, and dedicated
infrastructure.
</p>
<p className="mx-auto mt-2 max-w-xl text-sm text-zinc-400">
5-person team? You pay $29/month, not $200. No seat math, ever.
</p>
</div>
@ -53,7 +56,8 @@ export default function PricingPage() {
.
</p>
<p className="mt-6 text-sm text-zinc-500">
Prices shown in USD. Enterprise / self-hosted licensing available contact us.
Prices shown in USD. Flat-rate per org, no per-seat fees on any paid tier.
Enterprise / self-hosted licensing available; contact us.
</p>
</section>

View File

@ -6,10 +6,16 @@ import { api } from "@/lib/api";
import { showToast } from "./Toaster";
import { ConsoleModal } from "./ConsoleModal";
/** Base provisioning timeout in milliseconds (2 minutes). Used as the
* floor; the effective threshold scales with the number of workspaces
* concurrently provisioning (see effectiveTimeoutMs below). */
export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
import {
DEFAULT_RUNTIME_PROFILE,
provisionTimeoutForRuntime,
} from "@/lib/runtimeProfiles";
/** Re-export for backward compatibility with tests and other importers
* that previously imported DEFAULT_PROVISION_TIMEOUT_MS from this file.
* New code should read via getRuntimeProfile() from @/lib/runtimeProfiles. */
export const DEFAULT_PROVISION_TIMEOUT_MS =
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs;
/** The server provisions up to `PROVISION_CONCURRENCY` containers at
* once and paces the rest in a queue (`workspaceCreatePacingMs` =
@ -43,8 +49,12 @@ interface TimeoutEntry {
* time per node.
*/
export function ProvisioningTimeout({
timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
timeoutMs,
}: {
// If undefined (the default when mounted without a prop), each workspace's
// threshold is resolved from its runtime via provisionTimeoutForRuntime().
// Pass an explicit number to force a single threshold for every workspace
// (used by tests that want deterministic behavior regardless of runtime).
timeoutMs?: number;
}) {
const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
@ -57,19 +67,28 @@ export function ProvisioningTimeout({
const [dismissed, setDismissed] = useState<Set<string>>(new Set());
// Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
// (filter+map creates new array reference on every store update)
// (filter+map creates new array reference on every store update).
// Runtime included so the timeout threshold can be resolved per-node
// (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
// runtimes — a single threshold would false-alarm on one or the other).
// Separator: `|` between fields, `,` between nodes. Names may contain
// anything the user typed; strip `|` and `,` so serialization round-trips.
const provisioningNodes = useCanvasStore((s) => {
const result = s.nodes
.filter((n) => n.data.status === "provisioning")
.map((n) => `${n.id}:${n.data.name}`);
.map((n) => {
const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
const runtime = n.data.runtime ?? "";
return `${n.id}|${safeName}|${runtime}`;
});
return result.join(",");
});
const parsedProvisioningNodes = useMemo(
() =>
provisioningNodes
? provisioningNodes.split(",").map((entry) => {
const [id, name] = entry.split(":");
return { id, name };
const [id, name, runtime] = entry.split("|");
return { id, name, runtime };
})
: [],
[provisioningNodes],
@ -113,14 +132,21 @@ export function ProvisioningTimeout({
const interval = setInterval(() => {
const now = Date.now();
const newTimedOut: TimeoutEntry[] = [];
const effective = effectiveTimeoutMs(
timeoutMs,
parsedProvisioningNodes.length,
);
// Per-node timeout: each workspace resolves its own base via
// @/lib/runtimeProfiles (server-override → runtime profile →
// default), then scales by concurrent-provisioning count. A
// hermes workspace in a batch alongside two langgraph workspaces
// gets hermes's 12-min base, not langgraph's 2-min base.
for (const node of parsedProvisioningNodes) {
const startedAt = tracking.get(node.id);
if (startedAt && now - startedAt >= effective) {
if (!startedAt) continue;
const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
const effective = effectiveTimeoutMs(
base,
parsedProvisioningNodes.length,
);
if (now - startedAt >= effective) {
newTimedOut.push({
workspaceId: node.id,
workspaceName: node.name,

View File

@ -322,31 +322,6 @@ function countDescendants(nodeId: string, allNodes: Node<WorkspaceNodeData>[], v
* infinite recursion on circular parentId references and keeps the UI readable. */
const MAX_NESTING_DEPTH = 3;
/** Subscribes to allNodes only when children exist — isolates re-renders from parent */
function EmbeddedTeam({ members, depth, onSelect, onExtract }: {
members: Node<WorkspaceNodeData>[];
depth: number;
onSelect: (id: string) => void;
onExtract: (id: string) => void;
}) {
const allNodes = useCanvasStore((s) => s.nodes);
// Use grid layout at depth 0 when there are multiple members (departments side-by-side)
const useGrid = depth === 0 && members.length >= 2;
return (
<div className="mt-2 pt-2 border-t border-zinc-700/30">
<div className="text-[10px] text-zinc-500 uppercase tracking-widest mb-1.5">Team Members</div>
<div className={useGrid
? "grid grid-cols-2 gap-1.5 lg:grid-cols-3"
: "space-y-1.5"
}>
{members.map((child) => (
<TeamMemberChip key={child.id} node={child} allNodes={allNodes} depth={depth} onSelect={onSelect} onExtract={onExtract} />
))}
</div>
</div>
);
}
/** Recursive mini-card — mirrors parent card layout at smaller scale */
function TeamMemberChip({
node,

View File

@ -50,14 +50,14 @@ describe("PricingTable", () => {
it("renders all three plans with their CTAs", () => {
render(<PricingTable />);
expect(screen.getByRole("heading", { name: "Free" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Starter" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Pro" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Team" })).toBeTruthy();
expect(screen.getByRole("heading", { name: "Growth" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Get started" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Starter" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Pro" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Team" })).toBeTruthy();
expect(screen.getByRole("button", { name: "Upgrade to Growth" })).toBeTruthy();
});
it("shows the 'Most popular' badge only on the starter card", () => {
it("shows the 'Most popular' badge only on the Team card", () => {
render(<PricingTable />);
const badges = screen.getAllByText("Most popular");
expect(badges.length).toBe(1);
@ -74,7 +74,7 @@ describe("PricingTable", () => {
it("Paid CTA + anonymous → bounces to signup (no checkout call)", async () => {
mockedFetchSession.mockResolvedValue(null);
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
expect(mockedStartCheckout).not.toHaveBeenCalled();
});
@ -91,7 +91,7 @@ describe("PricingTable", () => {
});
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
await waitFor(() =>
expect(mockedStartCheckout).toHaveBeenCalledWith("pro", "acme"),
@ -111,7 +111,7 @@ describe("PricingTable", () => {
mockedGetTenantSlug.mockReturnValue("");
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => {
const alert = screen.getByRole("alert");
@ -129,7 +129,7 @@ describe("PricingTable", () => {
mockedStartCheckout.mockRejectedValue(new Error("checkout: 500 boom"));
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
await waitFor(() => {
const alert = screen.getByRole("alert");
@ -140,7 +140,7 @@ describe("PricingTable", () => {
it("treats fetchSession network errors as anonymous (fail-closed to signup)", async () => {
mockedFetchSession.mockRejectedValue(new Error("network down"));
render(<PricingTable />);
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
expect(mockedStartCheckout).not.toHaveBeenCalled();
});
@ -155,7 +155,7 @@ describe("PricingTable", () => {
mockedStartCheckout.mockReturnValue(new Promise(() => {}));
render(<PricingTable />);
const button = screen.getByRole("button", { name: "Upgrade to Pro" });
const button = screen.getByRole("button", { name: "Upgrade to Growth" });
fireEvent.click(button);
await waitFor(() => {

View File

@ -8,6 +8,12 @@ global.fetch = vi.fn(() =>
import { useCanvasStore } from "../../store/canvas";
import type { WorkspaceData } from "../../store/socket";
import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
import {
DEFAULT_RUNTIME_PROFILE,
RUNTIME_PROFILES,
getRuntimeProfile,
provisionTimeoutForRuntime,
} from "@/lib/runtimeProfiles";
// Helper to build a WorkspaceData object
function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
@ -184,4 +190,102 @@ describe("ProvisioningTimeout", () => {
.nodes.filter((n) => n.data.status === "provisioning");
expect(stillProvisioning).toHaveLength(2);
});
// ── Runtime-aware timeout regression tests (2026-04-24 outage) ────────────
// Prior to this, a hermes workspace consistently false-alarmed at 2 min
// into its 8-13 min cold boot, pushing users to retry something that
// would have come online on its own. The runtime-aware override keeps
// the 2-min floor for fast docker runtimes while giving hermes its
// honest 12-min budget.
describe("runtime profile resolution (@/lib/runtimeProfiles)", () => {
describe("provisionTimeoutForRuntime", () => {
it("returns the default for unknown/missing runtimes", () => {
expect(provisionTimeoutForRuntime(undefined)).toBe(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
expect(provisionTimeoutForRuntime("")).toBe(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
expect(provisionTimeoutForRuntime("some-future-runtime")).toBe(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
});
it("returns default for known-fast runtimes (not in profile map)", () => {
// If someone ever adds one of these to RUNTIME_PROFILES with a
// slower value, this test catches the unintended regression.
expect(provisionTimeoutForRuntime("claude-code")).toBe(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
expect(provisionTimeoutForRuntime("langgraph")).toBe(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
expect(provisionTimeoutForRuntime("crewai")).toBe(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
});
it("returns hermes override when runtime = hermes", () => {
expect(provisionTimeoutForRuntime("hermes")).toBe(
RUNTIME_PROFILES.hermes?.provisionTimeoutMs,
);
expect(provisionTimeoutForRuntime("hermes")).toBeGreaterThanOrEqual(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs * 5,
);
});
it("server-side workspace override wins over runtime profile", () => {
// The resolution order is: overrides → profile → default.
// An operator-tunable per-workspace number on the backend
// (e.g. via a template manifest field) should beat the canvas
// runtime map.
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: 60_000,
}),
).toBe(60_000);
expect(
provisionTimeoutForRuntime("some-unknown", {
provisionTimeoutMs: 300_000,
}),
).toBe(300_000);
});
});
describe("getRuntimeProfile", () => {
it("returns a structural profile with required fields", () => {
const profile = getRuntimeProfile("hermes");
expect(profile.provisionTimeoutMs).toBeTypeOf("number");
expect(profile.provisionTimeoutMs).toBeGreaterThan(0);
});
it("default profile is a valid superset of every override", () => {
// Every entry in RUNTIME_PROFILES must provide fields the
// default does — otherwise consumers could get undefined where
// they expected a number. This test enforces that contract so
// future entries can't accidentally drop fields.
for (const [runtime, profile] of Object.entries(RUNTIME_PROFILES)) {
const resolved = getRuntimeProfile(runtime);
expect(
resolved.provisionTimeoutMs,
`runtime=${runtime} must resolve to a number`,
).toBeTypeOf("number");
expect(resolved.provisionTimeoutMs).toBeGreaterThan(0);
// Profile's explicit value should be used iff present.
if (profile.provisionTimeoutMs !== undefined) {
expect(resolved.provisionTimeoutMs).toBe(profile.provisionTimeoutMs);
}
}
});
});
describe("DEFAULT_PROVISION_TIMEOUT_MS backward-compat export", () => {
it("still exports the same default for legacy importers", () => {
expect(DEFAULT_PROVISION_TIMEOUT_MS).toBe(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
});
});
});
});

View File

@ -183,7 +183,31 @@ describe("ChannelsTab — htmlFor/id label associations (WCAG 1.3.1)", () => {
beforeEach(() => {
mockApiGet.mockImplementation((url: string) => {
if (url.includes("/channels/adapters")) {
return Promise.resolve([{ type: "telegram", display_name: "Telegram" }]);
// Mirror the real GET /channels/adapters shape — schema-driven form
// relies on config_schema arriving from the adapter. A bare
// {type, display_name} mock renders an empty form and every
// getByLabelText below fails.
return Promise.resolve([
{
type: "telegram",
display_name: "Telegram",
config_schema: [
{
key: "bot_token",
label: "Bot Token",
type: "password",
required: true,
sensitive: true,
},
{
key: "chat_id",
label: "Chat IDs",
type: "text",
required: true,
},
],
},
]);
}
return Promise.resolve([]);
});

View File

@ -31,12 +31,12 @@ export function UnsavedChangesGuard({
</AlertDialog.Title>
<div className="guard-dialog__actions">
<AlertDialog.Cancel asChild>
<button className="guard-dialog__keep-btn" onClick={onKeepEditing}>
<button type="button" className="guard-dialog__keep-btn">
Keep editing
</button>
</AlertDialog.Cancel>
<AlertDialog.Action asChild>
<button className="guard-dialog__discard-btn" onClick={onDiscard}>
<button type="button" className="guard-dialog__discard-btn">
Discard
</button>
</AlertDialog.Action>

View File

@ -186,7 +186,7 @@ function ActivityRow({
: "bg-zinc-800/60 border-zinc-700/40"
}`}
>
<button onClick={onToggle} className="w-full text-left px-3 py-2">
<button type="button" onClick={onToggle} className="w-full text-left px-3 py-2">
{/* Top row: type badge + method + time */}
<div className="flex items-center gap-2">
<span className={`text-[8px] font-mono px-1.5 py-0.5 rounded ${typeStyle.text} ${typeStyle.bg} border ${typeStyle.border}`}>

View File

@ -4,9 +4,23 @@ import { useState, useEffect, useCallback, useId } from "react";
import { api } from "@/lib/api";
import { ConfirmDialog } from "@/components/ConfirmDialog";
// ConfigField mirrors the Go struct returned by GET /channels/adapters —
// the UI renders one input per field in the order the adapter returns
// them, so per-platform form shape stays server-owned.
interface ConfigField {
key: string;
label: string;
type: "text" | "password" | "textarea";
required: boolean;
sensitive?: boolean;
placeholder?: string;
help?: string;
}
interface ChannelAdapter {
type: string;
display_name: string;
config_schema?: ConfigField[];
}
interface Channel {
@ -25,6 +39,11 @@ interface Props {
workspaceId: string;
}
// Telegram is the only platform that supports "Detect Chats" via
// getUpdates. Every other platform uses a webhook URL that already
// encodes the chat, so the button is only offered when useful.
const SUPPORTS_DETECT_CHATS = new Set(["telegram"]);
function relativeTime(iso: string | null | undefined): string {
if (!iso) return "never";
const diff = Date.now() - new Date(iso).getTime();
@ -41,11 +60,12 @@ export function ChannelsTab({ workspaceId }: Props) {
const [showForm, setShowForm] = useState(false);
const [testing, setTesting] = useState<string | null>(null);
const [pendingDelete, setPendingDelete] = useState<Channel | null>(null);
const [error, setError] = useState("");
// Form state
// Form state — schema-driven: formValues holds the typed-in config for
// whichever adapter is currently selected, keyed by ConfigField.key.
const [formType, setFormType] = useState("telegram");
const [formBotToken, setFormBotToken] = useState("");
const [formChatId, setFormChatId] = useState("");
const [formValues, setFormValues] = useState<Record<string, string>>({});
const [formAllowedUsers, setFormAllowedUsers] = useState("");
const [formError, setFormError] = useState("");
const [discovering, setDiscovering] = useState(false);
@ -53,18 +73,13 @@ export function ChannelsTab({ workspaceId }: Props) {
const [selectedChats, setSelectedChats] = useState<Set<string>>(new Set());
const [showManualInput, setShowManualInput] = useState(false);
// Stable IDs for label↔input associations (WCAG 1.3.1)
const platformId = useId();
const botTokenId = useId();
const chatIdId = useId();
const allowedUsersId = useId();
const currentAdapter = adapters.find((a) => a.type === formType);
const currentSchema: ConfigField[] = currentAdapter?.config_schema || [];
const load = useCallback(async () => {
// Fetch channels and adapters independently so a failure in one
// doesn't blank the other. Previously a single Promise.all + silent
// catch meant ANY request failing left both `channels` and
// `adapters` empty — the user saw a "+ Connect" button with no
// platform options, with no clue why.
const [chResult, adResult] = await Promise.allSettled([
api.get<Channel[]>(`/workspaces/${workspaceId}/channels`),
api.get<ChannelAdapter[]>(`/channels/adapters`),
@ -82,8 +97,6 @@ export function ChannelsTab({ workspaceId }: Props) {
console.warn("ChannelsTab: adapters load failed", adResult.reason);
errors.push("platforms");
}
// Surface BOTH failure modes so the user can distinguish
// "no channels configured" from "API unreachable".
if (errors.length > 0) {
setError(`Failed to load ${errors.join(" and ")} — try refreshing`);
} else {
@ -100,8 +113,24 @@ export function ChannelsTab({ workspaceId }: Props) {
return () => clearInterval(interval);
}, [load]);
// Reset form values when the selected platform changes — each platform
// has a different field set, so reusing old values would leak stale
// data across platforms.
useEffect(() => {
setFormValues({});
setDiscoveredChats([]);
setSelectedChats(new Set());
setShowManualInput(false);
setFormError("");
}, [formType]);
const setFieldValue = (key: string, value: string) => {
setFormValues((prev) => ({ ...prev, [key]: value }));
};
const handleDiscover = async () => {
if (!formBotToken) {
const botToken = formValues["bot_token"] || "";
if (!botToken) {
setFormError("Enter a bot token first");
return;
}
@ -111,16 +140,15 @@ export function ChannelsTab({ workspaceId }: Props) {
try {
const res = await api.post<{ chats: { chat_id: string; name: string; type: string }[]; hint: string }>(
`/channels/discover`,
{ channel_type: formType, bot_token: formBotToken, workspace_id: workspaceId }
{ channel_type: formType, bot_token: botToken, workspace_id: workspaceId }
);
const chats = res.chats || [];
setDiscoveredChats(chats);
if (chats.length === 0) {
setFormError("No chats found. For groups: add the bot and send a message. For DMs: send /start to the bot first. Then retry.");
} else {
// Auto-select all discovered chats
setSelectedChats(new Set(chats.map((c) => c.chat_id)));
setFormChatId(chats.map((c) => c.chat_id).join(", "));
setFieldValue("chat_id", chats.map((c) => c.chat_id).join(", "));
}
} catch (e) {
setFormError(String(e));
@ -134,15 +162,22 @@ export function ChannelsTab({ workspaceId }: Props) {
const next = new Set(prev);
if (next.has(chatId)) next.delete(chatId);
else next.add(chatId);
setFormChatId(Array.from(next).join(", "));
setFieldValue("chat_id", Array.from(next).join(", "));
return next;
});
};
const handleCreate = async () => {
setFormError("");
if (!formBotToken || !formChatId) {
setFormError("Bot token and chat ID are required");
// Client-side required-field check so the user sees the gap before
// we round-trip to the server. ValidateConfig on the backend remains
// authoritative — adapter-specific rules like "bot_token OR webhook_url"
// for Slack aren't expressible in required-flag alone.
const missing = currentSchema
.filter((f) => f.required && !(formValues[f.key] || "").trim())
.map((f) => f.label);
if (missing.length > 0) {
setFormError(`Required: ${missing.join(", ")}`);
return;
}
try {
@ -150,14 +185,20 @@ export function ChannelsTab({ workspaceId }: Props) {
.split(",")
.map((s) => s.trim())
.filter(Boolean);
// Only send keys the schema knows about — avoids accidentally
// persisting stale values when the user switched platforms mid-edit.
const config: Record<string, string> = {};
for (const f of currentSchema) {
const v = (formValues[f.key] || "").trim();
if (v) config[f.key] = v;
}
await api.post(`/workspaces/${workspaceId}/channels`, {
channel_type: formType,
config: { bot_token: formBotToken, chat_id: formChatId },
config,
allowed_users: allowed,
});
setShowForm(false);
setFormBotToken("");
setFormChatId("");
setFormValues({});
setFormAllowedUsers("");
load();
} catch (e) {
@ -165,8 +206,6 @@ export function ChannelsTab({ workspaceId }: Props) {
}
};
const [error, setError] = useState("");
const handleToggle = async (ch: Channel) => {
try {
await api.patch(`/workspaces/${workspaceId}/channels/${ch.id}`, {
@ -228,7 +267,7 @@ export function ChannelsTab({ workspaceId }: Props) {
</div>
)}
{/* Create form */}
{/* Create form — schema-driven */}
{showForm && (
<div className="space-y-2 p-3 bg-zinc-800/40 rounded border border-zinc-700/50">
<div>
@ -244,73 +283,69 @@ export function ChannelsTab({ workspaceId }: Props) {
))}
</select>
</div>
<div>
<label htmlFor={botTokenId} className="text-[10px] text-zinc-500 block mb-1">Bot Token</label>
<input
id={botTokenId}
type="password"
value={formBotToken}
onChange={(e) => setFormBotToken(e.target.value)}
placeholder="123456:ABC-DEF..."
className="w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600"
/>
</div>
<div>
<div className="flex items-center justify-between mb-1">
<label htmlFor={chatIdId} className="text-[10px] text-zinc-500">Chat IDs</label>
<button
onClick={handleDiscover}
disabled={discovering || !formBotToken}
className="text-[10px] px-2 py-0.5 rounded bg-blue-600/20 text-blue-400 hover:bg-blue-600/30 transition disabled:opacity-40"
>
{discovering ? "Detecting..." : "Detect Chats"}
</button>
{/* Render one input per schema field. Fallback path: if the
backend didn't return a schema (older platform version), show
a warning prompting a platform upgrade rather than guessing fields. */}
{currentSchema.length === 0 ? (
<div className="text-[10px] text-yellow-500">
Platform exposes no config schema; upgrade the platform to pick up first-class support.
</div>
{discoveredChats.length > 0 && (
<div className="space-y-1 mb-2">
{discoveredChats.map((chat) => (
<label
key={chat.chat_id}
className="flex items-center gap-2 px-2 py-1.5 bg-zinc-900/50 rounded border border-zinc-700/50 cursor-pointer hover:bg-zinc-800/50"
>
<input
type="checkbox"
checked={selectedChats.has(chat.chat_id)}
onChange={() => toggleChat(chat.chat_id)}
className="rounded border-zinc-600"
/>
<span className="text-xs text-zinc-300">{chat.name || "Unknown"}</span>
<span className="text-[10px] text-zinc-500 ml-auto">{chat.type} {chat.chat_id}</span>
</label>
))}
</div>
)}
{(discoveredChats.length === 0 || showManualInput) && (
<input
id={chatIdId}
value={formChatId}
onChange={(e) => setFormChatId(e.target.value)}
placeholder="-100123456789, -100987654321"
className="w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600"
) : (
currentSchema.map((field) => (
<SchemaField
key={field.key}
field={field}
value={formValues[field.key] || ""}
onChange={(v) => setFieldValue(field.key, v)}
// Detect Chats button lives next to the chat_id input on
// Telegram only (the only platform with getUpdates).
renderExtras={
field.key === "chat_id" && SUPPORTS_DETECT_CHATS.has(formType)
? () => (
<>
<div className="flex items-center justify-end mb-1 -mt-1">
<button
onClick={handleDiscover}
disabled={discovering || !formValues["bot_token"]}
className="text-[10px] px-2 py-0.5 rounded bg-blue-600/20 text-blue-400 hover:bg-blue-600/30 transition disabled:opacity-40"
>
{discovering ? "Detecting..." : "Detect Chats"}
</button>
</div>
{discoveredChats.length > 0 && (
<div className="space-y-1 mb-2">
{discoveredChats.map((chat) => (
<label
key={chat.chat_id}
className="flex items-center gap-2 px-2 py-1.5 bg-zinc-900/50 rounded border border-zinc-700/50 cursor-pointer hover:bg-zinc-800/50"
>
<input
type="checkbox"
checked={selectedChats.has(chat.chat_id)}
onChange={() => toggleChat(chat.chat_id)}
className="rounded border-zinc-600"
/>
<span className="text-xs text-zinc-300">{chat.name || "Unknown"}</span>
<span className="text-[10px] text-zinc-500 ml-auto">{chat.type} {chat.chat_id}</span>
</label>
))}
<button
onClick={() => setShowManualInput(!showManualInput)}
className="text-[10px] text-blue-400 hover:underline"
>
{showManualInput ? "hide manual input" : "edit manually"}
</button>
</div>
)}
</>
)
: undefined
}
/>
)}
<p className="text-[11px] text-zinc-500 mt-0.5">
{discoveredChats.length > 0 ? (
<>
Chats: <span className="text-zinc-400">{formChatId || "(none selected)"}</span>
{" · "}
<button
onClick={() => setShowManualInput(!showManualInput)}
className="text-blue-400 hover:underline"
>
{showManualInput ? "hide manual input" : "edit manually"}
</button>
</>
) : (
"Click Detect Chats after adding the bot to groups or sending /start in DMs."
)}
</p>
</div>
))
)}
<div>
<label htmlFor={allowedUsersId} className="text-[10px] text-zinc-500 block mb-1">
Allowed Users <span className="text-zinc-600">(optional, comma-separated)</span>
@ -323,7 +358,7 @@ export function ChannelsTab({ workspaceId }: Props) {
className="w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600"
/>
<p className="text-[11px] text-zinc-500 mt-0.5">
Telegram user IDs. Leave empty to allow everyone.
Platform-specific user IDs. Leave empty to allow everyone.
</p>
</div>
{formError && (
@ -343,7 +378,7 @@ export function ChannelsTab({ workspaceId }: Props) {
<div className="text-center py-8">
<p className="text-zinc-500 text-xs">No channels connected</p>
<p className="text-zinc-600 text-[10px] mt-1">
Connect Telegram, Slack, or Discord to chat with this agent from social platforms.
Connect Telegram, Slack, Discord, or Lark / Feishu to chat with this agent from social platforms.
</p>
</div>
)}
@ -364,7 +399,7 @@ export function ChannelsTab({ workspaceId }: Props) {
{ch.channel_type.charAt(0).toUpperCase() + ch.channel_type.slice(1)}
</span>
<span className="text-[10px] text-zinc-500">
{ch.config.chat_id}
{ch.config.chat_id || ch.config.channel_id || ""}
</span>
</div>
<div className="flex items-center gap-1.5">
@ -415,3 +450,53 @@ export function ChannelsTab({ workspaceId }: Props) {
</div>
);
}
// SchemaField renders one ConfigField as a label + input. Kept inline in
// this file so the ChannelsTab stays self-contained; promote to its own
// module if another tab ever needs it.
function SchemaField({
field,
value,
onChange,
renderExtras,
}: {
field: ConfigField;
value: string;
onChange: (v: string) => void;
renderExtras?: () => React.ReactNode;
}) {
const inputId = useId();
const common =
"w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600";
return (
<div>
<label htmlFor={inputId} className="text-[10px] text-zinc-500 block mb-1">
{field.label}
{!field.required && <span className="text-zinc-600"> (optional)</span>}
</label>
{field.type === "textarea" ? (
<textarea
id={inputId}
value={value}
onChange={(e) => onChange(e.target.value)}
placeholder={field.placeholder}
rows={3}
className={common}
/>
) : (
<input
id={inputId}
type={field.type === "password" ? "password" : "text"}
value={value}
onChange={(e) => onChange(e.target.value)}
placeholder={field.placeholder}
className={common}
/>
)}
{renderExtras?.()}
{field.help && (
<p className="text-[11px] text-zinc-500 mt-0.5">{field.help}</p>
)}
</div>
);
}

View File

@ -44,7 +44,7 @@ export function FilesToolbar({
<div className="flex gap-1.5">
{root === "/configs" && (
<>
<button onClick={onNewFile} aria-label="Create new file" className="text-[10px] text-blue-400 hover:text-blue-300" title="Create new file">
<button type="button" onClick={onNewFile} aria-label="Create new file" className="text-[10px] text-blue-400 hover:text-blue-300" title="Create new file">
+ New
</button>
<input
@ -57,20 +57,20 @@ export function FilesToolbar({
className="hidden"
onChange={(e) => e.target.files && onUpload(e.target.files)}
/>
<button onClick={() => uploadRef.current?.click()} aria-label="Upload folder" className="text-[10px] text-blue-400 hover:text-blue-300" title="Upload folder">
<button type="button" onClick={() => uploadRef.current?.click()} aria-label="Upload folder" className="text-[10px] text-blue-400 hover:text-blue-300" title="Upload folder">
Upload
</button>
</>
)}
<button onClick={onDownloadAll} aria-label="Download all files" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Download all files">
<button type="button" onClick={onDownloadAll} aria-label="Download all files" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Download all files">
Export
</button>
{root === "/configs" && (
<button onClick={onClearAll} aria-label="Delete all files" className="text-[10px] text-red-400/60 hover:text-red-400" title="Delete all files">
<button type="button" onClick={onClearAll} aria-label="Delete all files" className="text-[10px] text-red-400/60 hover:text-red-400" title="Delete all files">
Clear
</button>
)}
<button onClick={onRefresh} aria-label="Refresh file list" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Refresh">
<button type="button" onClick={onRefresh} aria-label="Refresh file list" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Refresh">
</button>
</div>

View File

@ -55,7 +55,7 @@ export function TracesTab({ workspaceId }: Props) {
<div className="p-4 space-y-2">
<div className="flex items-center justify-between mb-2">
<span className="text-xs text-zinc-400">{traces.length} traces</span>
<button onClick={loadTraces} className="text-[10px] text-zinc-500 hover:text-zinc-300">
<button type="button" onClick={loadTraces} className="text-[10px] text-zinc-500 hover:text-zinc-300">
Refresh
</button>
</div>

View File

@ -104,7 +104,7 @@ export function TagList({ label, values, onChange, placeholder }: { label: strin
{values.map((v, i) => (
<span key={i} className="inline-flex items-center gap-1 px-1.5 py-0.5 bg-zinc-800 border border-zinc-700 rounded text-[10px] text-zinc-300 font-mono">
{v}
<button aria-label={`Remove tag ${v}`} onClick={() => onChange(values.filter((_, j) => j !== i))} className="text-zinc-500 hover:text-red-400">×</button>
<button type="button" aria-label={`Remove tag ${v}`} onClick={() => onChange(values.filter((_, j) => j !== i))} className="text-zinc-500 hover:text-red-400">×</button>
</span>
))}
</div>
@ -131,7 +131,7 @@ export function Section({ title, children, defaultOpen = true }: { title: string
const [open, setOpen] = useState(defaultOpen);
return (
<div className="border border-zinc-800 rounded mb-2">
<button onClick={() => setOpen(!open)} className="w-full flex items-center justify-between px-3 py-1.5 text-[10px] text-zinc-400 hover:text-zinc-200 bg-zinc-900/50">
<button type="button" onClick={() => setOpen(!open)} className="w-full flex items-center justify-between px-3 py-1.5 text-[10px] text-zinc-400 hover:text-zinc-200 bg-zinc-900/50">
<span className="font-medium uppercase tracking-wider">{title}</span>
<span>{open ? "▾" : "▸"}</span>
</button>

View File

@ -113,9 +113,9 @@ function SecretRow({ label, secretKey, isSet, scope, globalMode, onSave, onDelet
{isSet && <span className="text-[10px] text-green-500 bg-green-900/30 px-1.5 py-0.5 rounded">Set</span>}
{scope && <ScopeBadge scope={scope} />}
{!editing && isSet && (globalMode || scope !== "global") && (
<button onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
<button type="button" onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
)}
<button onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
<button type="button" onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
{actionLabel()}
</button>
</div>
@ -128,7 +128,7 @@ function SecretRow({ label, secretKey, isSet, scope, globalMode, onSave, onDelet
type={isPlaintext ? "text" : "password"} autoFocus
className="flex-1 bg-zinc-900 border border-zinc-600 rounded px-2 py-1 text-[10px] text-zinc-100 font-mono focus:outline-none focus:border-blue-500"
/>
<button
<button type="button"
onClick={() => { onSave(value); setEditing(false); setValue(""); }}
disabled={!value}
className="px-2 py-1 bg-blue-600 hover:bg-blue-500 text-[10px] rounded text-white disabled:opacity-30"
@ -165,10 +165,10 @@ function CustomSecretRow({ secretKey, scope, globalMode, onSave, onDelete }: {
<span className="text-[10px] text-green-500">Set</span>
{!globalMode && <ScopeBadge scope={scope} />}
{canDelete && !editing && (
<button onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
<button type="button" onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
)}
{(canDelete || showOverride) && (
<button onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
<button type="button" onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
{editing ? "Cancel" : showOverride ? "Override" : "Update"}
</button>
)}
@ -181,7 +181,7 @@ function CustomSecretRow({ secretKey, scope, globalMode, onSave, onDelete }: {
placeholder="New value" type="password" autoFocus
className="flex-1 bg-zinc-900 border border-zinc-600 rounded px-2 py-1 text-[10px] text-zinc-100 font-mono focus:outline-none focus:border-blue-500"
/>
<button
<button type="button"
onClick={() => { onSave(value); setEditing(false); setValue(""); }}
disabled={!value}
className="px-2 py-1 bg-blue-600 hover:bg-blue-500 text-[10px] rounded text-white disabled:opacity-30"
@ -355,16 +355,16 @@ export function SecretsSection({ workspaceId, requiredEnv }: { workspaceId: stri
<input value={newValue} onChange={(e) => setNewValue(e.target.value)} placeholder="Value" type="password"
className="w-full bg-zinc-900 border border-zinc-600 rounded px-2 py-1 text-[10px] text-zinc-100 focus:outline-none focus:border-blue-500" />
<div className="flex gap-2">
<button onClick={() => { if (newKey && newValue) handleSave(newKey, newValue); }} disabled={!newKey || !newValue}
<button type="button" onClick={() => { if (newKey && newValue) handleSave(newKey, newValue); }} disabled={!newKey || !newValue}
className="px-2 py-1 bg-blue-600 hover:bg-blue-500 text-[10px] rounded text-white disabled:opacity-30">
Save{globalMode ? " (Global)" : ""}
</button>
<button onClick={() => { setShowAdd(false); setNewKey(""); setNewValue(""); }}
<button type="button" onClick={() => { setShowAdd(false); setNewKey(""); setNewValue(""); }}
className="px-2 py-1 bg-zinc-700 hover:bg-zinc-600 text-[10px] rounded text-zinc-300">Cancel</button>
</div>
</div>
) : (
<button onClick={() => setShowAdd(true)} className="text-[10px] text-blue-400 hover:text-blue-300">
<button type="button" onClick={() => setShowAdd(true)} className="text-[10px] text-blue-400 hover:text-blue-300">
+ Add {globalMode ? "Global " : ""}Variable
</button>
)}

View File

@ -6,32 +6,44 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
// runs happily in node. Splitting keeps the node tests fast.
// ---------------------------------------------------------------------------
// 401 handling — gated on SaaS-tenant hostname
// 401 handling — session-probe-before-redirect
// ---------------------------------------------------------------------------
//
// Before fix/quickstart-bugless, any 401 from any endpoint triggered
// `redirectToLogin()`, navigating to `/cp/auth/login`. That route
// exists only on SaaS (mounted by cp_proxy when CP_UPSTREAM_URL is
// set). On localhost / self-hosted / Vercel preview it 404s, so the
// user lands on a broken login page instead of seeing the actual error.
// History:
// 1. fix/quickstart-bugless: gated redirect on SaaS hostname (slug).
// 2. fix/api-401-probe-before-redirect (this file): probe /cp/auth/me
// before redirecting on a 401 from a non-auth path. The earlier
// behaviour redirected on EVERY 401, so a single 401 from
// /workspaces/:id/plugins (workspace-scoped — refused by the
// tenant admin bearer) yanked the user to AuthKit even when
// the session was fine. The probe lets us tell "session dead"
// from "endpoint refused this token."
//
// These tests lock in:
// - SaaS tenant hostname (*.moleculesai.app) → 401 still redirects.
// - non-SaaS hostname (localhost, LAN IP, apex) → 401 throws, no
// redirect, so the caller renders a real error affordance.
// Matrix:
// slug | path | probe → me | expected
// --- | --- | --- | ---
// acme | /cp/auth/me | (n/a) | redirect (path IS auth)
// acme | /workspaces/... | 401 | redirect (session dead)
// acme | /workspaces/... | 200 | throw, no redirect
// acme | /workspaces/... | network err| throw, no redirect
// "" | /workspaces/... | (n/a) | throw, no redirect (no slug)
const mockFetch = vi.fn();
globalThis.fetch = mockFetch;
function mockFailure(status: number, text: string) {
function mockNextResponse(status: number, text = "") {
mockFetch.mockResolvedValueOnce({
ok: false,
ok: status >= 200 && status < 300,
status,
json: () => Promise.reject(new Error("no json")),
text: () => Promise.resolve(text),
} as unknown as Response);
}
function mockNextNetworkError() {
mockFetch.mockRejectedValueOnce(new Error("network"));
}
function setHostname(host: string) {
Object.defineProperty(window, "location", {
configurable: true,
@ -59,27 +71,66 @@ describe("api 401 handling", () => {
vi.resetModules();
});
it("redirects to login on SaaS tenant hostname", async () => {
it("redirects when /cp/auth/me itself 401s — that IS the session-dead signal", async () => {
setHostname("acme.moleculesai.app");
mockFailure(401, '{"error":"admin auth required"}');
// Single fetch: the /cp/auth/me call itself.
mockNextResponse(401, '{"error":"unauthenticated"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/Session expired/);
await expect(api.get("/cp/auth/me")).rejects.toThrow(/Session expired/);
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
// No probe fired — we already know the session is dead.
expect(mockFetch).toHaveBeenCalledTimes(1);
});
it("redirects when /cp/auth/me probe ALSO 401s — session genuinely dead", async () => {
setHostname("acme.moleculesai.app");
// First call: the workspace-scoped fetch returns 401.
mockNextResponse(401, '{"error":"workspace token required"}');
// Second call: the probe to /cp/auth/me also 401s.
mockNextResponse(401, '{"error":"unauthenticated"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/Session expired/);
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
});
it("does NOT redirect when probe returns 200 — endpoint refused this token, session fine", async () => {
setHostname("acme.moleculesai.app");
// First call: workspace-scoped 401.
mockNextResponse(401, '{"error":"workspace token required"}');
// Second call: probe shows the session is alive.
mockNextResponse(200, '{"user_id":"u1","org_id":"o1","email":"x@y"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
});
it("does NOT redirect when probe network-errors — conservative fallback", async () => {
setHostname("acme.moleculesai.app");
mockNextResponse(401, '{"error":"workspace token required"}');
mockNextNetworkError();
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
});
it("does NOT redirect on localhost — throws a real error instead", async () => {
setHostname("localhost");
mockFailure(401, '{"error":"admin auth required"}');
mockNextResponse(401, '{"error":"admin auth required"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/401/);
expect(redirectSpy).not.toHaveBeenCalled();
// No slug → no probe fires either.
expect(mockFetch).toHaveBeenCalledTimes(1);
});
it("does NOT redirect on a LAN hostname", async () => {
setHostname("192.168.1.74");
mockFailure(401, '{"error":"missing workspace auth token"}');
mockNextResponse(401, '{"error":"missing workspace auth token"}');
const { api } = await import("../api");
await expect(api.get("/workspaces/abc/activity")).rejects.toThrow(/401/);
@ -91,7 +142,7 @@ describe("api 401 handling", () => {
// Users landing on app.moleculesai.app (pre-tenant-selection) must
// see the real 401 error rather than loop on login.
setHostname("app.moleculesai.app");
mockFailure(401, '{"error":"admin auth required"}');
mockNextResponse(401, '{"error":"admin auth required"}');
const { api } = await import("../api");
await expect(api.get("/workspaces")).rejects.toThrow(/401/);

View File

@ -60,15 +60,45 @@ async function request<T>(
return request<T>(method, path, body, retryCount + 1, options);
}
if (res.status === 401) {
// Session expired or credentials lost. On SaaS (tenant subdomain)
// the login page lives at /cp/auth/login and is mounted by the
// control-plane reverse proxy — redirect. On self-hosted / local
// dev / Vercel preview there IS no /cp/* mount, so redirecting
// would navigate to a 404 ("404 page not found") instead of the
// real error the user should see. In that case, throw instead
// and let the caller render a meaningful failure (retry button,
// error banner, etc.).
if (slug) {
// Distinguish "session is dead" from "this endpoint refused this
// token." Old behaviour blanket-redirected on every 401, so a
// single transient 401 from a workspace-scoped endpoint
// (/workspaces/:id/peers, /plugins, etc. that need a workspace
// token rather than the tenant admin bearer) yanked the user
// back to AuthKit even when their session was perfectly fine.
// That broke the staging-tabs E2E for the entire 2026-04-25
// night; #2073/#2074 worked around the symptom in the test by
// mocking 401→200 for every fetch, but the user-facing bug
// stayed.
//
// The canonical "session is dead" signal is /cp/auth/me
// returning 401. For any 401 on a non-auth path, probe
// /cp/auth/me before deciding to redirect:
// - probe 401 → session is actually dead → redirect
// - probe 200 → session is fine, the endpoint just refused
// our specific token → throw a real error,
// caller renders an error state
// - probe network error → assume session-fine (conservative;
// better to throw than to redirect on a
// transient probe failure)
//
// Self-hosted / localhost / reserved subdomains still throw
// without redirecting (slug is empty in those cases) — same
// policy as before.
const isAuthPath = path.startsWith("/cp/auth/");
let sessionDead = isAuthPath;
if (!isAuthPath && slug) {
try {
const probe = await fetch(`${PLATFORM_URL}/cp/auth/me`, {
credentials: "include",
signal: AbortSignal.timeout(5000),
});
sessionDead = probe.status === 401;
} catch {
// Probe failed (network/timeout) — fall through to throw.
}
}
if (sessionDead && slug) {
const { redirectToLogin } = await import("./auth");
redirectToLogin("sign-in");
throw new Error("Session expired — redirecting to login");

View File

@ -32,6 +32,10 @@ export interface Plan {
// plans is the canonical order shown on the pricing page: free → starter
// → pro. Change the order here + the rendered columns follow. Keeping
// this as a module-level const so tests can assert against a known list.
//
// Flat-rate positioning (Issue #1833): "starter" and "pro" are flat-rate
// per-org, not per-seat. This is a deliberate wedge against Cursor/Windsurf
// ($40/seat) — at 5 engineers the Team tier is 28% cheaper.
export const plans: Plan[] = [
{
id: "free",
@ -48,8 +52,8 @@ export const plans: Plan[] = [
},
{
id: "starter",
name: "Starter",
tagline: "For small teams shipping real agents",
name: "Team",
tagline: "Flat-rate for teams — one price, no per-seat fees",
price: "$29/month",
features: [
"10 workspaces",
@ -57,14 +61,15 @@ export const plans: Plan[] = [
"Private Upstash Redis namespace",
"Email support (48h)",
"5M LLM tokens / month included",
"No per-seat pricing",
],
ctaLabel: "Upgrade to Starter",
ctaLabel: "Upgrade to Team",
highlighted: true,
},
{
id: "pro",
name: "Pro",
tagline: "For production multi-agent orgs",
name: "Growth",
tagline: "Flat-rate for production multi-agent orgs",
price: "$99/month",
features: [
"Unlimited workspaces",
@ -72,9 +77,10 @@ export const plans: Plan[] = [
"Cross-workspace A2A audit log",
"Priority support (24h)",
"25M LLM tokens / month included",
"No per-seat pricing",
"Usage-based overage billing",
],
ctaLabel: "Upgrade to Pro",
ctaLabel: "Upgrade to Growth",
},
];

View File

@ -0,0 +1,120 @@
/**
* Runtime profiles: per-runtime UX metadata.
*
* Scaling target: hundreds of runtimes (plugin-architecture-v2 roadmap).
* This module is the single source of truth for runtime-specific UI knobs
* on the canvas side. Each runtime can declare:
*
* - provisionTimeoutMs: when to show the "taking longer than expected"
* banner. Fast docker runtimes = 2min; slow source-build runtimes = 12min.
* - (future) label, icon, color, helpUrl, capabilities; add as needed.
*
* Resolution order (most specific wins):
*
* 1. Server-provided override on the workspace data (e.g.
* `workspace.data.provisionTimeoutMs` set from a template manifest).
* Lets operators tune without a canvas release once server-side
* declarative config lands.
* 2. Per-runtime entry in RUNTIME_PROFILES.
* 3. DEFAULT_RUNTIME_PROFILE.
*
* Adding a new runtime:
* - If it's fast (under ~2min cold boot): do nothing, the default catches it.
* - If it's slow: add one entry to RUNTIME_PROFILES below.
* - Long-term: move runtime profiles server-side so this file can shrink.
*
* Architectural note: this deliberately lives under /lib, NOT
* /components/ProvisioningTimeout. Other components (e.g. a
* "create workspace" dialog that needs to know the runtime's expected
* cold-boot time) should import from here too; this avoids duplicating the
* runtime-name knowledge across the codebase.
*/
/**
* Structural shape of a runtime profile. Add fields as new UX knobs
* become runtime-specific. Every field should be optional so new runtimes
* can partially fill the profile without breaking older code that reads
* only some fields.
*/
export interface RuntimeProfile {
/** Milliseconds before the canvas shows the "taking too long" banner.
* Base value only; the ProvisioningTimeout component still scales this by
* concurrent-provisioning count. */
provisionTimeoutMs?: number;
// Future extensions (kept commented until used):
// label?: string;
// icon?: string;
// color?: string;
// helpUrl?: string;
}
/** The floor every runtime inherits unless it overrides. Calibrated for
* docker-local fast runtimes (claude-code, langgraph, crewai) where cold
* boot is 30-90s. */
export const DEFAULT_RUNTIME_PROFILE: Required<
Pick<RuntimeProfile, "provisionTimeoutMs">
> = {
provisionTimeoutMs: 120_000, // 2 min
};
/**
* Named per-runtime overrides. Keep this map small and explicit;
* each entry is a deliberate statement that this runtime's cold-boot
* behavior differs materially from the default.
*
* Each override must also ship with a comment explaining WHY the default
* is wrong for this runtime. Unexplained numbers rot.
*/
export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
hermes: {
// 12 min. Installs ripgrep + ffmpeg + node22 + builds hermes-agent
// from source + Playwright + Chromium (~300MB download). Measured
// cold boots on staging EC2 routinely land at 8-13 min. Aligns
// with SaaS E2E's PROVISION_TIMEOUT_SECS=900 (15 min) so the UI
// warning lands shortly before the backend itself gives up.
provisionTimeoutMs: 720_000,
},
};
/**
* Data fields the canvas can consult for per-workspace overrides. These
* let the backend (via workspace data on the socket payload) override
* profile values without a canvas release.
*
* Intentionally loose typing; if a field isn't present on the node, we
* fall through to the runtime profile.
*/
export interface WorkspaceRuntimeOverrides {
provisionTimeoutMs?: number;
}
/**
* Resolve a runtime profile for a given runtime name, optionally merging
* server-provided per-workspace overrides on top.
*
* Resolution (most-specific wins):
* overrides.provisionTimeoutMs
* RUNTIME_PROFILES[runtime].provisionTimeoutMs
* DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs
*/
export function getRuntimeProfile(
runtime: string | undefined,
overrides?: WorkspaceRuntimeOverrides,
): Required<Pick<RuntimeProfile, "provisionTimeoutMs">> {
const profile = runtime ? RUNTIME_PROFILES[runtime] : undefined;
return {
provisionTimeoutMs:
overrides?.provisionTimeoutMs ??
profile?.provisionTimeoutMs ??
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
};
}
/** Convenience: just the provisionTimeoutMs. Equivalent to
* `getRuntimeProfile(runtime, overrides).provisionTimeoutMs`. */
export function provisionTimeoutForRuntime(
runtime: string | undefined,
overrides?: WorkspaceRuntimeOverrides,
): number {
return getRuntimeProfile(runtime, overrides).provisionTimeoutMs;
}
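
A minimal usage sketch of the resolution order above (illustrative only; the relative import path is an assumption, not part of this file):

import { provisionTimeoutForRuntime } from "./runtime-profiles"; // path assumed

// The hermes override beats the default floor...
provisionTimeoutForRuntime("hermes");                                  // 720_000 (12 min)
// ...a runtime with no RUNTIME_PROFILES entry falls back to the 2-minute floor...
provisionTimeoutForRuntime("langgraph");                               // 120_000
// ...and a server-provided per-workspace override beats both.
provisionTimeoutForRuntime("hermes", { provisionTimeoutMs: 300_000 }); // 300_000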

View File

@ -4,6 +4,7 @@
"plugins": [
{"name": "browser-automation", "repo": "Molecule-AI/molecule-ai-plugin-browser-automation", "ref": "main"},
{"name": "ecc", "repo": "Molecule-AI/molecule-ai-plugin-ecc", "ref": "main"},
{"name": "gh-identity", "repo": "Molecule-AI/molecule-ai-plugin-gh-identity", "ref": "main"},
{"name": "molecule-audit", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit", "ref": "main"},
{"name": "molecule-audit-trail", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit-trail", "ref": "main"},
{"name": "molecule-careful-bash", "repo": "Molecule-AI/molecule-ai-plugin-molecule-careful-bash", "ref": "main"},

View File

@ -32,7 +32,7 @@
set -euo pipefail
DRY_RUN=1
MAX_DELETE_PCT=50 # refuse to delete more than half the records in one run
MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}" # refuse to delete more than this pct of records in one run; caller can override via env
REGION="${AWS_DEFAULT_REGION:-us-east-2}"
for arg in "$@"; do

View File

@ -23,10 +23,13 @@ import (
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"
// External plugin — registers an EnvMutator that injects GITHUB_TOKEN /
// GH_TOKEN from a GitHub App installation token. Soft-dep: only active
// when GITHUB_APP_ID env var is set (see main() for the gate).
pluginloader "github.com/Molecule-AI/molecule-ai-plugin-github-app-auth/pluginloader"
// External plugins — each registers EnvMutator(s) that run at workspace
// provision time. Loaded via soft-dep gates in main() so self-hosters
// without the App or without per-agent identity configured keep working.
githubappauth "github.com/Molecule-AI/molecule-ai-plugin-github-app-auth/pluginloader"
ghidentity "github.com/Molecule-AI/molecule-ai-plugin-gh-identity/pluginloader"
"github.com/Molecule-AI/molecule-monorepo/platform/pkg/provisionhook"
)
func main() {
@ -153,22 +156,49 @@ func main() {
wh.SetCPProvisioner(cpProv)
}
// External-plugin env mutators — each plugin contributes 0+ mutators
// onto a shared registry. Order matters: gh-identity populates
// MOLECULE_AGENT_ROLE-derived attribution env vars that downstream
// mutators and the workspace's install.sh can then read. Keep
// github-app-auth last because it fails loudly on misconfig and its
// failure mode is "no GITHUB_TOKEN" — worth surfacing after the
// cheaper mutators already ran.
envReg := provisionhook.NewRegistry()
// gh-identity plugin — per-agent attribution via env injection + gh
// wrapper shipped as base64 env. Soft-dep: no config file is OK
// (plugin no-ops when no role is set on the workspace).
// Tracks molecule-core#1957.
if res, err := ghidentity.BuildRegistry(); err != nil {
log.Fatalf("gh-identity plugin: %v", err)
} else {
envReg.Register(res.Mutator)
log.Printf("gh-identity: registered (config file=%q)", os.Getenv("MOLECULE_GH_IDENTITY_CONFIG_FILE"))
}
// github-app-auth plugin — injects GITHUB_TOKEN + GH_TOKEN into every
// workspace env using the App's installation access token (rotates ~hourly).
// Soft-skip when GITHUB_APP_* env vars are absent so dev/self-hosters
// without an App configured keep working; fail-loud only on MISCONFIG
// (e.g. APP_ID set but key file missing), not on unset.
if os.Getenv("GITHUB_APP_ID") != "" {
if reg, err := pluginloader.BuildRegistry(); err != nil {
if reg, err := githubappauth.BuildRegistry(); err != nil {
log.Fatalf("github-app-auth plugin: %v", err)
} else {
wh.SetEnvMutators(reg)
log.Printf("github-app-auth: registered, %d mutator(s) in chain", reg.Len())
// Copy the plugin's mutators onto the shared registry so the
// TokenProvider probe (FirstTokenProvider) still finds them.
for _, m := range reg.Mutators() {
envReg.Register(m)
}
log.Printf("github-app-auth: registered, %d mutator(s) added to chain", reg.Len())
}
} else {
log.Println("github-app-auth: GITHUB_APP_ID unset — skipping plugin registration (agents will use any PAT from .env)")
}
wh.SetEnvMutators(envReg)
log.Printf("env-mutator chain: %v", envReg.Names())
// Offline handler: broadcast event + auto-restart the dead workspace
onWorkspaceOffline := func(innerCtx context.Context, workspaceID string) {
if err := broadcaster.RecordAndBroadcast(innerCtx, "WORKSPACE_OFFLINE", workspaceID, map[string]interface{}{}); err != nil {

View File

@ -4,6 +4,7 @@ go 1.25.0
require (
github.com/DATA-DOG/go-sqlmock v1.5.2
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f
github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d
github.com/alicebob/miniredis/v2 v2.37.0
github.com/creack/pty v1.1.18

View File

@ -4,8 +4,12 @@ github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7Oputl
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
github.com/Microsoft/go-winio v0.4.21 h1:+6mVbXh4wPzUrl1COX9A+ZCvEpYsOBZ6/+kwDnvLyro=
github.com/Microsoft/go-winio v0.4.21/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84=
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f h1:YkLRhUg+9qr9OV9N8dG1Hj0Ml7TThHlRwh5F//oUJVs=
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f/go.mod h1:NqdtlWZDJvpXNJRHnMkPhTKHdA1LZTNH+63TB66JSOU=
github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d h1:GpYhP6FxaJZc1Ljy5/YJ9ZIVGvfOqZBmDolNr2S5x2g=
github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d/go.mod h1:3a6LR/zd7FjR9ZwLTbytwYlWuCBsbCOVFlEg0WnoYiM=
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f h1:YkLRhUg+9qr9OV9N8dG1Hj0Ml7TThHlRwh5F//oUJVs=
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f/go.mod h1:NqdtlWZDJvpXNJRHnMkPhTKHdA1LZTNH+63TB66JSOU=
github.com/alicebob/miniredis/v2 v2.37.0 h1:RheObYW32G1aiJIj81XVt78ZHJpHonHLHW7OLIshq68=
github.com/alicebob/miniredis/v2 v2.37.0/go.mod h1:TcL7YfarKPGDAthEtl5NBeHZfeUQj6OXMm/+iu5cLMM=
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=

View File

@ -17,6 +17,14 @@ type ChannelAdapter interface {
// DisplayName returns the human-readable name (e.g. "Telegram").
DisplayName() string
// ConfigSchema describes the config fields each adapter needs. The UI
// renders the connect-channel form from this list, so each platform's
// field set (Telegram bot_token+chat_id, Lark webhook_url+verify_token,
// Slack bot_token+channel_id, Discord webhook_url) can be captured
// correctly without per-platform UI branching. Adapters must return the
// same schema on every call — the order is the rendering order.
ConfigSchema() []ConfigField
// ValidateConfig checks that channel_config JSONB has required fields.
ValidateConfig(config map[string]interface{}) error
@ -31,6 +39,33 @@ type ChannelAdapter interface {
StartPolling(ctx context.Context, config map[string]interface{}, onMessage MessageHandler) error
}
// ConfigField describes a single config field for the channels connect-form UI.
// Canvas renders one input per field in order. Values are strings in
// channel_config JSONB — this struct carries only presentation + validation
// hints; ValidateConfig on the adapter is still the source of truth for
// acceptance.
type ConfigField struct {
// Key is the channel_config map key (e.g. "webhook_url").
Key string `json:"key"`
// Label is the human-readable field name (e.g. "Webhook URL").
Label string `json:"label"`
// Type controls the HTML input type: "text" | "password" | "textarea".
Type string `json:"type"`
// Required marks the field as non-optional in the UI. Still enforced
// server-side via ValidateConfig regardless of this flag.
Required bool `json:"required"`
// Sensitive means the value must not be logged or shown unmasked in
// read APIs after creation. Canvas uses this to redact the value in
// list responses; server-side encryption is governed by sensitiveFields
// in secret.go (today: bot_token + webhook_secret only — this flag is
// forward-looking until that list is widened).
Sensitive bool `json:"sensitive"`
// Placeholder is rendered as the input's placeholder attribute.
Placeholder string `json:"placeholder,omitempty"`
// Help is a short one-liner shown below the input.
Help string `json:"help,omitempty"`
}
// InboundMessage is the standardized message from any social platform.
type InboundMessage struct {
ChatID string // Platform-specific chat/channel ID

View File

@ -127,10 +127,13 @@ func TestListAdapters(t *testing.T) {
}
found := false
for _, a := range list {
if a["type"] == "telegram" {
if a.Type == "telegram" {
found = true
if a["display_name"] != "Telegram" {
t.Errorf("expected display_name 'Telegram', got %q", a["display_name"])
if a.DisplayName != "Telegram" {
t.Errorf("expected display_name 'Telegram', got %q", a.DisplayName)
}
if len(a.ConfigSchema) == 0 {
t.Error("Telegram adapter must expose a non-empty ConfigSchema")
}
}
}
@ -740,10 +743,10 @@ func TestListAdapters_IncludesSlack(t *testing.T) {
list := ListAdapters()
found := false
for _, a := range list {
if a["type"] == "slack" {
if a.Type == "slack" {
found = true
if a["display_name"] != "Slack" {
t.Errorf("expected display_name 'Slack', got %q", a["display_name"])
if a.DisplayName != "Slack" {
t.Errorf("expected display_name 'Slack', got %q", a.DisplayName)
}
}
}

View File

@ -38,6 +38,32 @@ type DiscordAdapter struct{}
func (d *DiscordAdapter) Type() string { return "discord" }
func (d *DiscordAdapter) DisplayName() string { return "Discord" }
// ConfigSchema — Discord only needs a webhook URL for outbound.
// public_key is the Ed25519 pubkey used to verify inbound Interactions
// signatures (stored hex-encoded); not required if you only do outbound.
func (d *DiscordAdapter) ConfigSchema() []ConfigField {
return []ConfigField{
{
Key: "webhook_url",
Label: "Webhook URL",
Type: "password",
Required: true,
Sensitive: true,
Placeholder: "https://discord.com/api/webhooks/{id}/{token}",
Help: "From Server Settings → Integrations → Webhooks → Copy URL.",
},
{
Key: "public_key",
Label: "Interactions Public Key (hex)",
Type: "password",
Required: false,
Sensitive: true,
Placeholder: "optional — for inbound slash commands",
Help: "Ed25519 public key from the Discord Developer Portal → General Information. Only needed to receive slash commands.",
},
}
}
// ValidateConfig checks that the channel config contains a valid Discord
// Incoming Webhook URL. Returns a human-readable error for the Canvas UI.
func (d *DiscordAdapter) ValidateConfig(config map[string]interface{}) error {

View File

@ -241,10 +241,10 @@ func TestListAdapters_IncludesDiscord(t *testing.T) {
list := ListAdapters()
found := false
for _, a := range list {
if a["type"] == "discord" {
if a.Type == "discord" {
found = true
if a["display_name"] != "Discord" {
t.Errorf("expected display_name 'Discord', got %q", a["display_name"])
if a.DisplayName != "Discord" {
t.Errorf("expected display_name 'Discord', got %q", a.DisplayName)
}
}
}

View File

@ -37,6 +37,33 @@ const (
func (l *LarkAdapter) Type() string { return "lark" }
func (l *LarkAdapter) DisplayName() string { return "Lark / Feishu" }
// ConfigSchema — Lark Custom Bot webhook URL + optional Event Subscription
// verify token. The webhook URL already encodes the chat, so no separate
// chat_id field is needed (and StartPolling is a no-op for Lark — inbound
// is delivered by ParseWebhook from the Event Subscription callback).
func (l *LarkAdapter) ConfigSchema() []ConfigField {
return []ConfigField{
{
Key: "webhook_url",
Label: "Custom Bot Webhook URL",
Type: "password", // last path component is a secret
Required: true,
Sensitive: true,
Placeholder: "https://open.feishu.cn/open-apis/bot/v2/hook/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX",
Help: "From the Lark/Feishu bot page → Webhook settings. open.feishu.cn (China) and open.larksuite.com (international) both accepted.",
},
{
Key: "verify_token",
Label: "Event Subscription Verify Token",
Type: "password",
Required: false,
Sensitive: true,
Placeholder: "optional — from Event Subscriptions page",
Help: "Only needed if you want to receive messages from Lark. Paste the \"Verification Token\" from your app's Event Subscriptions configuration.",
},
}
}
// ValidateConfig requires webhook_url to point at a Lark or Feishu Custom
// Bot endpoint. verify_token is optional — when set, inbound events with a
// mismatching token are rejected (use Lark's "Verification Token" from the

View File

@ -401,3 +401,60 @@ func TestRegistry_HasLark(t *testing.T) {
t.Errorf("got %q want lark", a.Type())
}
}
// TestLark_ConfigSchema locks in the contract: Lark exposes a required +
// sensitive webhook_url and an optional + sensitive verify_token, in that
// order. Canvas renders the connect-form from this list, so the order and
// required/sensitive flags are part of the observable API surface.
func TestLark_ConfigSchema(t *testing.T) {
schema := (&LarkAdapter{}).ConfigSchema()
if len(schema) != 2 {
t.Fatalf("expected 2 fields, got %d", len(schema))
}
want := []struct {
key string
required bool
sensitive bool
}{
{"webhook_url", true, true},
{"verify_token", false, true},
}
for i, w := range want {
got := schema[i]
if got.Key != w.key {
t.Errorf("field %d: key = %q, want %q", i, got.Key, w.key)
}
if got.Required != w.required {
t.Errorf("field %d (%s): required = %v, want %v", i, w.key, got.Required, w.required)
}
if got.Sensitive != w.sensitive {
t.Errorf("field %d (%s): sensitive = %v, want %v", i, w.key, got.Sensitive, w.sensitive)
}
if got.Label == "" {
t.Errorf("field %d (%s): label must not be empty", i, w.key)
}
}
}
// TestListAdapters_IncludesLark confirms the adapter is wired into the
// registry and its schema reaches the API layer intact. Regression guard
// against future registry.go refactors silently dropping Lark.
func TestListAdapters_IncludesLark(t *testing.T) {
list := ListAdapters()
var found *AdapterInfo
for i := range list {
if list[i].Type == "lark" {
found = &list[i]
break
}
}
if found == nil {
t.Fatal("lark adapter not in ListAdapters() output")
}
if found.DisplayName != "Lark / Feishu" {
t.Errorf("DisplayName = %q, want 'Lark / Feishu'", found.DisplayName)
}
if len(found.ConfigSchema) == 0 {
t.Error("ConfigSchema must not be empty in registry output")
}
}

View File

@ -15,14 +15,31 @@ func GetAdapter(channelType string) (ChannelAdapter, bool) {
return a, ok
}
// ListAdapters returns metadata about all available adapters.
func ListAdapters() []map[string]string {
result := make([]map[string]string, 0, len(adapters))
// AdapterInfo is the metadata payload returned by ListAdapters — the Canvas
// connect-channel form renders its field list dynamically from config_schema.
type AdapterInfo struct {
Type string `json:"type"`
DisplayName string `json:"display_name"`
ConfigSchema []ConfigField `json:"config_schema"`
}
// ListAdapters returns metadata about all available adapters, in a stable
// order (sorted by display name) so UI rendering + test assertions don't
// depend on Go's random map iteration.
func ListAdapters() []AdapterInfo {
result := make([]AdapterInfo, 0, len(adapters))
for _, a := range adapters {
result = append(result, map[string]string{
"type": a.Type(),
"display_name": a.DisplayName(),
result = append(result, AdapterInfo{
Type: a.Type(),
DisplayName: a.DisplayName(),
ConfigSchema: a.ConfigSchema(),
})
}
// Sort by display name for deterministic ordering.
for i := 1; i < len(result); i++ {
for j := i; j > 0 && result[j-1].DisplayName > result[j].DisplayName; j-- {
result[j-1], result[j] = result[j], result[j-1]
}
}
return result
}
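
For illustration, a sketch of how a Canvas-facing handler could expose this list (the route name and gin wiring are assumptions, not part of this change; imports of net/http and gin are elided):

// Sketch only; gin is assumed here because the platform's other handlers use it.
func listAdaptersHandler(c *gin.Context) {
	// Each entry serializes roughly as:
	//   {"type":"telegram","display_name":"Telegram","config_schema":[{"key":"bot_token",...}]}
	c.JSON(http.StatusOK, gin.H{"adapters": ListAdapters()})
}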

View File

@ -31,6 +31,57 @@ type SlackAdapter struct{}
func (s *SlackAdapter) Type() string { return "slack" }
func (s *SlackAdapter) DisplayName() string { return "Slack" }
// ConfigSchema — Slack supports two mutually-exclusive outbound modes:
// Bot API (bot_token + channel_id, supports per-message identity override)
// and Incoming Webhook (webhook_url, legacy, no identity override). The
// form exposes both; ValidateConfig enforces "one or the other".
func (s *SlackAdapter) ConfigSchema() []ConfigField {
return []ConfigField{
{
Key: "bot_token",
Label: "Bot Token (xoxb-…)",
Type: "password",
Required: false,
Sensitive: true,
Placeholder: "xoxb-1234-5678-abc...",
Help: "Bot API mode — supports per-agent identity override. Required scopes: chat:write, chat:write.customize. Leave empty to use Incoming Webhook mode instead.",
},
{
Key: "channel_id",
Label: "Channel ID",
Type: "text",
Required: false,
Placeholder: "C01234ABCDE",
Help: "Required when using Bot Token mode. From the channel's \"View channel details\" dialog.",
},
{
Key: "webhook_url",
Label: "Incoming Webhook URL (legacy)",
Type: "password",
Required: false,
Sensitive: true,
Placeholder: "https://hooks.slack.com/services/T.../B.../...",
Help: "Simpler mode — no per-agent identity. Either Bot Token OR Webhook URL is required.",
},
{
Key: "username",
Label: "Override Username",
Type: "text",
Required: false,
Placeholder: "optional, Bot Token mode only",
Help: "Display name to use on outbound messages. Ignored in Webhook mode.",
},
{
Key: "icon_emoji",
Label: "Override Icon Emoji",
Type: "text",
Required: false,
Placeholder: ":robot_face:",
Help: "Emoji shortcode for per-message avatar. Ignored in Webhook mode.",
},
}
}
// ValidateConfig checks that the channel config contains a valid Slack
// Incoming Webhook URL (must start with https://hooks.slack.com/).
// Returns an error whose message becomes part of the 400 response body so

View File

@ -39,6 +39,31 @@ type TelegramAdapter struct{}
func (t *TelegramAdapter) Type() string { return "telegram" }
func (t *TelegramAdapter) DisplayName() string { return "Telegram" }
// ConfigSchema — Telegram uses Bot API long-polling. The bot token comes
// from @BotFather; chat_id is a comma-separated list discovered via the
// "Detect Chats" UI flow (calls Bot.getUpdates).
func (t *TelegramAdapter) ConfigSchema() []ConfigField {
return []ConfigField{
{
Key: "bot_token",
Label: "Bot Token",
Type: "password",
Required: true,
Sensitive: true,
Placeholder: "123456789:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
Help: "From @BotFather → /newbot (or /token on an existing bot).",
},
{
Key: "chat_id",
Label: "Chat IDs",
Type: "text",
Required: true,
Placeholder: "-100123456789, -100987654321",
Help: "Comma-separated chat IDs. Use \"Detect Chats\" after adding the bot to groups or sending /start in DMs.",
},
}
}
func (t *TelegramAdapter) ValidateConfig(config map[string]interface{}) error {
token, _ := config["bot_token"].(string)
if token == "" {

View File

@ -142,13 +142,29 @@ func validateAgentURL(rawURL string) error {
{"127.0.0.0/8", "loopback address"},
{"fe80::/10", "IPv6 link-local address (cloud metadata analogue)"},
{"::1/128", "IPv6 loopback address"},
// Always-blocked regardless of deploy mode: these ranges are never valid
// agent URLs in any deployment. The TEST-NET ranges (RFC-5737) are
// documentation-only. CGNAT (RFC-6598) is never used for VPC subnets on
// any cloud provider. IPv4 multicast is never a unicast endpoint.
// fc00::/8 is the currently-unassigned half of the IPv6 ULA block
// (fd00::/8, the locally-assigned half, is allowed in SaaS mode).
// RFC 3849: 2001:db8::/32 is the IPv6 documentation prefix.
{"192.0.2.0/24", "TEST-NET-1 documentation range (RFC-5737)"},
{"198.51.100.0/24", "TEST-NET-2 documentation range (RFC-5737)"},
{"203.0.113.0/24", "TEST-NET-3 documentation range (RFC-5737)"},
{"100.64.0.0/10", "carrier-grade NAT address (RFC-6598)"},
{"224.0.0.0/4", "IPv4 multicast address"},
{"fc00::/8", "IPv6 ULA non-routable prefix (fc00::/8)"},
{"2001:db8::/32", "IPv6 documentation address (RFC-3849 reserved)"},
}
if !saasMode() {
blockedRanges = append(blockedRanges,
blockedRange{"10.0.0.0/8", "RFC-1918 private address"},
blockedRange{"172.16.0.0/12", "RFC-1918 private address"},
blockedRange{"192.168.0.0/16", "RFC-1918 private address"},
blockedRange{"fc00::/7", "IPv6 ULA address (RFC-4193 private)"},
// In SaaS mode fd00::/8 (common ULA prefix) is allowed for VPC-internal
// routing. fc00::/8 is already always-blocked above. In non-SaaS mode
// block the entire fc00::/7 supernet (covers both fd00 and fc00).
blockedRange{"fd00::/8", "IPv6 ULA address (RFC-4193 private)"},
)
}

View File

@ -540,6 +540,21 @@ func TestValidateAgentURL(t *testing.T) {
{"blocked IPv6 loopback [::1]", "http://[::1]:8080", true},
{"blocked IPv6 link-local [fe80::1]", "http://[fe80::1]:8080", true},
{"blocked IPv6 ULA [fd00::1]", "http://[fd00::1]:8080", true},
// ── Must be rejected: RFC 5737 TEST-NET reserved ranges ─────────────
// These addresses are reserved for documentation and example code.
// No production agent has a legitimate reason to use them.
{"blocked TEST-NET-1 192.0.2.x", "http://192.0.2.1:8080", true},
{"blocked TEST-NET-1 192.0.2.254", "http://192.0.2.254:9000", true},
{"blocked TEST-NET-2 198.51.100.x", "http://198.51.100.1:8080", true},
{"blocked TEST-NET-2 198.51.100.99", "http://198.51.100.99:8000", true},
{"blocked TEST-NET-3 203.0.113.x", "http://203.0.113.1:8080", true},
{"blocked TEST-NET-3 203.0.113.254", "http://203.0.113.254:9000", true},
// ── Must be rejected: RFC 3849 IPv6 documentation prefix ────────────
{"blocked IPv6 documentation 2001:db8::1", "http://[2001:db8::1]:8080", true},
{"blocked IPv6 documentation 2001:db8::ffff", "http://[2001:db8::ffff]:8000", true},
// IPv4-mapped IPv6 for a blocked range must also be rejected.
// Go normalises ::ffff:169.254.x.x to IPv4 via To4(), so the existing
// 169.254.0.0/16 entry catches it without a dedicated rule.
@ -570,6 +585,91 @@ func TestValidateAgentURL(t *testing.T) {
}
}
// TestValidateAgentURL_SaaSMode_AllowsRFC1918 is the integration-level wrapper test
// for the SaaS-mode SSRF relaxation in validateAgentURL (used at registration).
// It exercises validateAgentURL as called by the Register handler, not just the
// inner blockedRanges slice. Regression guard for the same class of bug as
// isSafeURL (issue #1785).
func TestValidateAgentURL_SaaSMode_AllowsRFC1918(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
t.Setenv("MOLECULE_ORG_ID", "")
for _, url := range []string{
"http://10.1.2.3/agent",
"http://10.0.0.5:8000/a2a",
"http://172.16.0.1/agent",
"http://172.18.0.42:8000/a2a",
"http://172.31.44.78/agent",
"http://192.168.1.100/agent",
"http://192.168.255.254:9000/a2a",
"http://[fd00::1]/agent",
"http://[fd12:3456:789a::42]/a2a",
} {
if err := validateAgentURL(url); err != nil {
t.Errorf("validateAgentURL(%q) in saasMode: got %v, want nil", url, err)
}
}
}
// TestValidateAgentURL_SaaSMode_StillBlocksMetadataEtAl verifies that even in
// SaaS mode the always-blocked ranges (metadata, loopback, TEST-NET, CGNAT,
// non-fd00 ULA) stay blocked.
func TestValidateAgentURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
t.Setenv("MOLECULE_ORG_ID", "")
for _, url := range []string{
"http://169.254.169.254/latest/meta-data/",
"http://169.254.0.1/",
"http://127.0.0.1:8080",
"http://[::1]:8080",
"http://192.0.2.5/agent",
"http://198.51.100.5/a2a",
"http://203.0.113.42/agent",
"http://100.64.0.1/agent",
"http://100.127.255.254:8000/a2a",
"http://[fc00::1]/agent",
"http://224.0.0.1/",
} {
if err := validateAgentURL(url); err == nil {
t.Errorf("validateAgentURL(%q) in saasMode: got nil, want block", url)
}
}
}
// TestValidateAgentURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart
// to TestValidateAgentURL_SaaSMode_AllowsRFC1918.
func TestValidateAgentURL_StrictMode_BlocksRFC1918(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted")
t.Setenv("MOLECULE_ORG_ID", "")
for _, url := range []string{
"http://10.1.2.3/agent",
"http://172.16.0.1:8000/a2a",
"http://172.31.44.78/agent",
"http://192.168.1.100/agent",
"http://[fd00::1]/agent",
} {
if err := validateAgentURL(url); err == nil {
t.Errorf("validateAgentURL(%q) in strict mode: got nil, want block", url)
}
}
}
// TestValidateAgentURL_SaaSMode_LegacyOrgID covers the legacy MOLECULE_ORG_ID
// signal (no MOLECULE_DEPLOY_MODE set) for validateAgentURL.
func TestValidateAgentURL_SaaSMode_LegacyOrgID(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "")
t.Setenv("MOLECULE_ORG_ID", "7b2179dc-8cc6-4581-a3c6-c8bff4481086")
for _, url := range []string{
"http://10.1.2.3/agent",
"http://172.18.0.42:8000/a2a",
"http://192.168.1.100/agent",
"http://[fd00::1]/agent",
} {
if err := validateAgentURL(url); err != nil {
t.Errorf("validateAgentURL(%q) with legacy MOLECULE_ORG_ID: got %v, want nil", url, err)
}
}
}
// ==================== C18 — Register ownership ====================
// TestRegister_C18_BootstrapAllowedNoTokens verifies that a workspace with NO

View File

@ -326,4 +326,101 @@ func TestDevModeAllowsLoopback_Predicate(t *testing.T) {
}
})
}
}
// TestIsSafeURL_SaaSMode_AllowsRFC1918 is the integration-level wrapper test
// for the SaaS-mode SSRF relaxation. It exercises isSafeURL (the public API),
// not isPrivateOrMetadataIP (the inner helper), ensuring the wrapper correctly
// propagates saasMode() to its helper.
//
// Regression guard: isSafeURL previously hardcoded RFC-1918 rejection and never
// called saasMode(), causing 502 on every A2A call from Docker-networked or VPC
// deployments (issue #1785 / PR #1785). The inner helper's TestIsPrivateOrMetadataIP_SaaSMode
// was green the whole time — classic "test the intent, not the integration" gap.
func TestIsSafeURL_SaaSMode_AllowsRFC1918(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
t.Setenv("MOLECULE_ORG_ID", "")
for _, url := range []string{
"http://10.1.2.3/agent",
"http://10.0.0.5:8000/a2a",
"http://172.16.0.1/agent",
"http://172.18.0.42:8000/a2a",
"http://172.31.44.78/agent",
"http://192.168.1.100/agent",
"http://192.168.255.254:9000/a2a",
"http://[fd00::1]/agent",
"http://[fd12:3456:789a::42]/a2a",
} {
if err := isSafeURL(url); err != nil {
t.Errorf("isSafeURL(%q) in saasMode: got %v, want nil", url, err)
}
}
}
// TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl verifies that even in SaaS
// mode the always-blocked ranges (metadata, loopback, TEST-NET, CGNAT) stay blocked.
func TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
t.Setenv("MOLECULE_ORG_ID", "")
for _, url := range []string{
// Cloud metadata — must stay blocked in every mode.
"http://169.254.169.254/latest/meta-data/",
"http://169.254.0.1/",
// Loopback — must stay blocked.
"http://127.0.0.1:8080",
"http://[::1]:8080",
// TEST-NET documentation ranges — must stay blocked.
"http://192.0.2.5/agent",
"http://198.51.100.5/a2a",
"http://203.0.113.42/agent",
// CGNAT — must stay blocked.
"http://100.64.0.1/agent",
"http://100.127.255.254:8000/a2a",
// ULA fc00::/8 (non-fd00 half) — must stay blocked in SaaS.
"http://[fc00::1]/agent",
// Non-RFC-1918 private ranges still blocked.
"http://224.0.0.1/",
} {
if err := isSafeURL(url); err == nil {
t.Errorf("isSafeURL(%q) in saasMode: got nil, want block", url)
}
}
}
// TestIsSafeURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart to
// TestIsSafeURL_SaaSMode_AllowsRFC1918. In self-hosted / single-container
// deployments there is no legitimate reason to reach RFC-1918 agents, so the
// wrapper must block them.
func TestIsSafeURL_StrictMode_BlocksRFC1918(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted")
t.Setenv("MOLECULE_ORG_ID", "")
for _, url := range []string{
"http://10.1.2.3/agent",
"http://172.16.0.1:8000/a2a",
"http://172.31.44.78/agent",
"http://192.168.1.100/agent",
"http://[fd00::1]/agent",
} {
if err := isSafeURL(url); err == nil {
t.Errorf("isSafeURL(%q) in strict mode: got nil, want block", url)
}
}
}
// TestIsSafeURL_SaasMode_LegacyOrgID covers the legacy MOLECULE_ORG_ID signal
// (no MOLECULE_DEPLOY_MODE set). An org ID alone is sufficient to activate SaaS
// mode per the saasMode() resolution ladder.
func TestIsSafeURL_SaasMode_LegacyOrgID(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "")
t.Setenv("MOLECULE_ORG_ID", "7b2179dc-8cc6-4581-a3c6-c8bff4481086")
for _, url := range []string{
"http://10.1.2.3/agent",
"http://172.18.0.42:8000/a2a",
"http://192.168.1.100/agent",
"http://[fd00::1]/agent",
} {
if err := isSafeURL(url); err != nil {
t.Errorf("isSafeURL(%q) with legacy MOLECULE_ORG_ID: got %v, want nil", url, err)
}
}
}
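
Taken together, these tests pin down how saasMode() must resolve. A sketch of that ladder, reconstructed from the test matrix rather than copied from the real helper (treat the exact shape as an assumption; "os" import elided):

// Inferred from the tests above:
//   MOLECULE_DEPLOY_MODE == "saas"                        → SaaS mode
//   MOLECULE_DEPLOY_MODE set to anything else             → strict mode
//   MOLECULE_DEPLOY_MODE unset, MOLECULE_ORG_ID non-empty → SaaS mode (legacy signal)
func saasModeSketch() bool {
	switch os.Getenv("MOLECULE_DEPLOY_MODE") {
	case "saas":
		return true
	case "":
		return os.Getenv("MOLECULE_ORG_ID") != ""
	default:
		return false
	}
}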

View File

@ -77,17 +77,26 @@ func (h *TerminalHandler) HandleConnect(c *gin.Context) {
// A2A message-passing, so we apply the same hierarchy check here.
// GH#756/#1609 security fix: if the caller claims a specific workspace
// identity (X-Workspace-ID header), the bearer token — if present — must
// belong to that claimed workspace. ValidateAnyToken accepted ANY valid org
// token, allowing Workspace A to forge X-Workspace-ID: B and reach B's
// terminal if A held any valid token. ValidateToken binds the token to
// the claimed workspace identity.
// belong to that claimed workspace. Previously ValidateAnyToken accepted
// ANY valid org token, allowing Workspace A to forge X-Workspace-ID: B
// and reach B's terminal if A held any valid token. ValidateToken binds
// the workspace-scoped token to the claimed workspace identity. Org-level
// tokens are handled separately via the org_token_id context key.
callerID := c.GetHeader("X-Workspace-ID")
if callerID != "" && callerID != workspaceID {
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
if tok != "" {
if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil {
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
return
// Org-scoped tokens (org_api_tokens) are validated at the org level
// by WorkspaceAuth and do not have a workspace_auth_tokens row, so
// ValidateToken always returns ErrInvalidToken for them. If WorkspaceAuth
// already validated an org token (org_token_id set in context), trust
// the X-Workspace-ID claim — the hierarchy is enforced by
// canCommunicateCheck below. Reject everything else.
if c.GetString("org_token_id") == "" {
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
return
}
}
}
if !canCommunicateCheck(callerID, workspaceID) {

View File

@ -455,3 +455,38 @@ func TestTerminalConnect_KI005_AllowsSiblingWorkspace(t *testing.T) {
}
}
// TestKI005_OrgToken_SkipsValidateToken verifies that when WorkspaceAuth already
// validated an org token (org_token_id set in gin context), the X-Workspace-ID
// claim is trusted without a workspace_auth_tokens lookup. The hierarchy is still
// enforced by canCommunicateCheck. Regression guard for the A2A routing regression
// introduced in GH#1885: internal routing uses org tokens which are not in
// workspace_auth_tokens, so ValidateToken would always fail for them.
func TestKI005_OrgToken_SkipsValidateToken(t *testing.T) {
setupTestDB(t) // no ValidateToken ExpectQuery — none should fire
prev := canCommunicateCheck
canCommunicateCheck = func(callerID, targetID string) bool {
// Simulate platform agent → target workspace (same org).
return callerID == "ws-platform" && targetID == "ws-target"
}
defer func() { canCommunicateCheck = prev }()
h := NewTerminalHandler(nil)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-target"}}
c.Request = httptest.NewRequest("GET", "/workspaces/ws-target/terminal", nil)
c.Request.Header.Set("X-Workspace-ID", "ws-platform")
c.Request.Header.Set("Authorization", "Bearer org-token-abc123")
// Simulate WorkspaceAuth having validated the org token (orgtoken.Validate
// succeeded). HandleConnect must skip ValidateToken and trust the claim.
c.Set("org_token_id", "tok-org-abc")
h.HandleConnect(c)
// Org token path: ValidateToken skipped → canCommunicateCheck=true →
// falls through to Docker path → 503 nil-docker (no Docker client).
if w.Code != http.StatusServiceUnavailable {
t.Errorf("org-token A2A: got %d, want 503 nil-docker (%s)", w.Code, w.Body.String())
}
}

View File

@ -6,6 +6,7 @@ package handlers
import (
"database/sql"
"errors"
"fmt"
"log"
"net/http"
@ -388,9 +389,24 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
// Now stop containers + remove volumes for all descendants (any depth).
// Any concurrent heartbeat / registration / liveness-triggered restart
// will see status='removed' and bail out early.
//
// #1843: Stop() errors used to be silently swallowed. On the CP/EC2
// backend, Stop() calls the control plane's DELETE workspaces endpoint
// to terminate the EC2; if that errors (CP transient 5xx, network),
// the EC2 stays running with no DB row to track it — the
// "14 orphan workspace EC2s on a 0-customer account" scenario.
// Aggregate Stop failures and surface them as 500 so the client can
// retry. The retry replays Stop with the same instance_id (still
// readable from the row even after status='removed') — idempotent on
// the CP side. RemoveVolume errors stay log-and-continue: those are
// local cleanup of /var/data, not infra-leak class.
var stopErrs []error
for _, descID := range descendantIDs {
if h.provisioner != nil {
h.provisioner.Stop(ctx, descID)
if err := h.provisioner.Stop(ctx, descID); err != nil {
log.Printf("Delete descendant %s stop error: %v", descID, err)
stopErrs = append(stopErrs, fmt.Errorf("stop descendant %s: %w", descID, err))
}
if err := h.provisioner.RemoveVolume(ctx, descID); err != nil {
log.Printf("Delete descendant %s volume removal warning: %v", descID, err)
}
@ -401,7 +417,10 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
// Stop + remove volume for the workspace itself
if h.provisioner != nil {
h.provisioner.Stop(ctx, id)
if err := h.provisioner.Stop(ctx, id); err != nil {
log.Printf("Delete %s stop error: %v", id, err)
stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", id, err))
}
if err := h.provisioner.RemoveVolume(ctx, id); err != nil {
log.Printf("Delete %s volume removal warning: %v", id, err)
}
@ -412,6 +431,21 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
"cascade_deleted": len(descendantIDs),
})
// If any Stop call failed, surface 500 so the client retries. The DB
// row is already 'removed' (idempotent), and Stop's instance_id
// lookup tolerates that — the retry replays the terminate. This is
// the loud-fail-instead-of-silent-leak choice; users see a 500
// instead of an orphaned EC2.
if len(stopErrs) > 0 {
c.JSON(http.StatusInternalServerError, gin.H{
"error": fmt.Sprintf("workspace marked removed, but %d stop call(s) failed — please retry: %v",
len(stopErrs), errors.Join(stopErrs...)),
"removed_count": len(allIDs),
"stop_failures": len(stopErrs),
})
return
}
// Hard purge: cascade delete all FK data and remove the DB row entirely (#1087)
if c.Query("purge") == "true" {
purgeIDs := pq.Array(allIDs)
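For reference, the retry contract that 500 gives callers: the row is already status='removed', so replaying the same DELETE is safe and simply re-drives Stop on the CP side. A minimal client-side sketch; the /workspaces/:id path, base URL, and bearer header here are assumptions for illustration, not taken from this handler:

package example

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

// retryWorkspaceDelete replays DELETE /workspaces/:id until the stop-failure
// 500 clears. Sketch only: path and header shape are assumed, and a real
// client would likely cap total elapsed time rather than attempt count.
func retryWorkspaceDelete(ctx context.Context, hc *http.Client, baseURL, id, token string) error {
	var lastErr error
	for attempt := 1; attempt <= 3; attempt++ {
		req, err := http.NewRequestWithContext(ctx, http.MethodDelete, baseURL+"/workspaces/"+id, nil)
		if err != nil {
			return err
		}
		req.Header.Set("Authorization", "Bearer "+token)
		resp, err := hc.Do(req)
		if err != nil {
			lastErr = err
		} else {
			resp.Body.Close()
			if resp.StatusCode < http.StatusInternalServerError {
				return nil // 2xx: removed and stopped; 4xx: not retryable
			}
			lastErr = fmt.Errorf("delete %s: attempt %d got %d", id, attempt, resp.StatusCode)
		}
		time.Sleep(time.Duration(attempt) * time.Second)
	}
	return lastErr
}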

View File

@ -96,6 +96,14 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
applyAgentGitIdentity(envVars, payload.Name)
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
// Propagate the workspace's role into env so role-aware plugins
// (gh-identity — molecule-core#1957) can read it without the
// plugin interface having to carry the full payload. Role is
// cosmetic metadata — no auth weight on it — safe to surface as env.
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
}
// Plugin extension point: run any registered EnvMutators (e.g.
// github-app-auth, vault-secrets) AFTER built-in identity injection so
// plugins can override or augment GIT_AUTHOR_*, GITHUB_TOKEN, etc.
@ -688,6 +696,11 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
applyAgentGitIdentity(envVars, payload.Name)
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
// Propagate role for role-aware plugins (#1957). See provisionWorkspace
// above for rationale.
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
}
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
log.Printf("CPProvisioner: env mutator failed for %s: %v", workspaceID, err)
// F1086 / #1206: env mutator errors (missing tokens, vault paths) must not
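For illustration, a role-aware plugin picking up the new key could look roughly like this; a minimal sketch in which the EnvMutator method name/signature and the git-identity tweak are assumptions, not behavior taken from molecule-core#1957:

package example

import "context"

// roleTagMutator is a sketch of a role-aware mutator in the gh-identity
// style. The Mutate name and signature are assumed; match whatever
// EnvMutator actually declares. Reading MOLECULE_AGENT_ROLE is the point.
type roleTagMutator struct{}

func (roleTagMutator) Mutate(ctx context.Context, workspaceID string, env map[string]string) error {
	role := env["MOLECULE_AGENT_ROLE"]
	if role == "" {
		return nil // role is optional, cosmetic metadata
	}
	// Cosmetic only: tag the git identity with the agent's role.
	if name := env["GIT_AUTHOR_NAME"]; name != "" {
		env["GIT_AUTHOR_NAME"] = name + " (" + role + ")"
	}
	return nil
}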

View File

@ -304,6 +304,7 @@ func CanvasOrBearer(database *sql.DB) gin.HandlerFunc {
}
c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "admin auth required"})
return
}
}

View File

@ -1011,8 +1011,10 @@ func TestCanvasOrBearer_TokensExist_NoCreds_Returns401(t *testing.T) {
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
@ -1023,6 +1025,47 @@ func TestCanvasOrBearer_TokensExist_NoCreds_Returns401(t *testing.T) {
if w.Code != http.StatusUnauthorized {
t.Errorf("no creds: got %d, want 401", w.Code)
}
if handlerCalled {
t.Error("handler was called after AbortWithStatusJSON — missing return allows fall-through")
}
if body := w.Body.String(); body == `{"ok":true}` {
t.Error("handler body written after AbortWithStatusJSON")
}
}
func TestCanvasOrBearer_TokensExist_WrongOrigin_Returns401(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
}
defer mockDB.Close()
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app")
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
req.Header.Set("Origin", "https://evil.example.com")
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("wrong origin: got %d, want 401", w.Code)
}
if handlerCalled {
t.Error("handler was called after AbortWithStatusJSON — missing return allows fall-through")
}
if body := w.Body.String(); body == `{"ok":true}` {
t.Error("handler body written after AbortWithStatusJSON")
}
}
func TestCanvasOrBearer_TokensExist_CanvasOrigin_Passes(t *testing.T) {
@ -1100,7 +1143,7 @@ func TestAdminAuth_RemovedWorkspaceToken_Returns401(t *testing.T) {
}
}
func TestCanvasOrBearer_TokensExist_WrongOrigin_Returns401(t *testing.T) {
func TestCanvasOrBearer_WrongOrigin_Blocked(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)

View File

@ -18,30 +18,49 @@ type ProvisionTimeoutEmitter interface {
}
// DefaultProvisioningTimeout is how long a workspace may sit in
// status='provisioning' before the sweeper flips it to 'failed'. The
// container-launch path has its own 3-minute context timeout
// (provisioner.ProvisionTimeout) but that only bounds the docker API call —
// a container that started but crashes before /registry/register never
// triggers that path and would sit in provisioning forever. 10 minutes
// covers pathological image-pull + user-data execution on a cold EC2 worker
// while still getting well ahead of the "15+ minute" stuck state users see
// in production.
// status='provisioning' before the sweeper flips it to 'failed'.
// Default for non-hermes runtimes (claude-code, langgraph, crewai,
// autogen, etc.) which cold-boot in <5 min. The container-launch path
// has its own 3-minute context timeout (provisioner.ProvisionTimeout)
// but that only bounds the docker API call — a container that started
// but crashes before /registry/register never triggers that path and
// would sit in provisioning forever. 10 minutes covers pathological
// image-pull + user-data execution on a cold EC2 worker while still
// getting well ahead of the "15+ minute" stuck state users see in
// production.
const DefaultProvisioningTimeout = 10 * time.Minute
// HermesProvisioningTimeout matches the CP bootstrap-watcher's
// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
// + 5 min sweep slack. Hermes cold-boot does apt + uv + Python venv +
// Node + hermes-agent install — 13-25 min on slow apt mirrors is
// normal. Without this, the sweep would flip the workspace to 'failed'
// at 10 min while the watcher (and the workspace itself) is still
// happily progressing through install. Issue #1843 follow-up: a
// healthy 10.5-min hermes boot was killed by the 10-min sweep on
// 2026-04-26, breaking #2061's E2E.
const HermesProvisioningTimeout = 30 * time.Minute
// DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
// as the hibernation monitor — cheap and bounded by the provisioning-state
// query which hits the primary key / status partial index.
const DefaultProvisionSweepInterval = 30 * time.Second
// provisioningTimeout reads the override from env, falling back to the
// default. Env var expressed in seconds so operators can tune via a normal
// container restart without a code change.
func provisioningTimeout() time.Duration {
// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
// runtimes — useful for ops debugging but loses the runtime nuance, so
// operators should prefer the defaults unless they have a specific
// reason.
func provisioningTimeoutFor(runtime string) time.Duration {
if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n > 0 {
return time.Duration(n) * time.Second
}
}
if runtime == "hermes" {
return HermesProvisioningTimeout
}
return DefaultProvisioningTimeout
}
@ -65,7 +84,8 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
ticker := time.NewTicker(interval)
defer ticker.Stop()
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s)", interval, provisioningTimeout())
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)
for {
select {
@ -80,33 +100,51 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
// sweepStuckProvisioning is one tick of the sweeper. Exported-for-test via
// the package boundary: keep all time.Now reads inside so tests can drive it
// deterministically by seeding updated_at rather than manipulating time.
//
// Runtime-aware: the per-workspace timeout depends on `runtime`. Hermes
// gets 30 min (matching the CP bootstrap-watcher's 25-min deadline + 5
// min slack); everything else gets 10 min. Without this distinction a
// healthy hermes cold-boot at 10-25 min got killed mid-install by this
// sweep, leaving an incoherent "marked failed but actually working"
// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
// canonical CP-side gating.
func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
timeout := provisioningTimeout()
timeoutSec := int(timeout / time.Second)
// Read candidates first so the event broadcast can include each id. The
// subsequent UPDATE re-checks the predicate to stay race-safe against
// concurrent restart / register paths that write updated_at.
// We can't pre-filter by age in SQL because the threshold depends
// on the row's runtime. Pull every provisioning row + its runtime
// + its age, evaluate per-row in Go. Still cheap — the
// status='provisioning' row count is bounded (workspaces in
// flight, not historical) and the partial index on status keeps
// it fast.
rows, err := db.DB.QueryContext(ctx, `
SELECT id FROM workspaces
SELECT id, COALESCE(runtime, ''), EXTRACT(EPOCH FROM (now() - updated_at))::int
FROM workspaces
WHERE status = 'provisioning'
AND updated_at < now() - ($1 || ' seconds')::interval
`, timeoutSec)
`)
if err != nil {
log.Printf("Provision-timeout sweep: query error: %v", err)
return
}
defer rows.Close()
var ids []string
type candidate struct {
id string
runtime string
ageSec int
}
var ids []candidate
for rows.Next() {
var id string
if err := rows.Scan(&id); err == nil {
ids = append(ids, id)
var c candidate
if err := rows.Scan(&c.id, &c.runtime, &c.ageSec); err == nil {
ids = append(ids, c)
}
}
for _, id := range ids {
for _, c := range ids {
timeout := provisioningTimeoutFor(c.runtime)
timeoutSec := int(timeout / time.Second)
if c.ageSec < timeoutSec {
continue
}
msg := "provisioning timed out — container started but never called /registry/register. Check container logs and network connectivity to the platform."
res, err := db.DB.ExecContext(ctx, `
UPDATE workspaces
@ -116,9 +154,9 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
WHERE id = $1
AND status = 'provisioning'
AND updated_at < now() - ($3 || ' seconds')::interval
`, id, msg, timeoutSec)
`, c.id, msg, timeoutSec)
if err != nil {
log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", id, err)
log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", c.id, err)
continue
}
affected, _ := res.RowsAffected()
@ -126,18 +164,19 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
// Raced with restart / register — no harm, just skip.
continue
}
log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
log.Printf("Provision-timeout sweep: %s (runtime=%q) stuck in provisioning > %s — marked failed", c.id, c.runtime, timeout)
// Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the
// canvas event handler only flips node state on the _FAILED case.
// A separate event type was considered but the UI reaction is
// identical either way — operators who need to distinguish can
// tell from the `source` payload field.
if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{
if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", c.id, map[string]interface{}{
"error": msg,
"timeout_secs": timeoutSec,
"runtime": c.runtime,
"source": "provision_timeout_sweep",
}); emitErr != nil {
log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", c.id, emitErr)
}
}
}

View File

@ -5,6 +5,7 @@ import (
"errors"
"sync"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
)
@ -40,13 +41,24 @@ func (f *fakeEmitter) count() int {
return len(f.events)
}
// candidateRows builds the new-shape query result (id, runtime, age_sec).
// Use this in every sweep test to match the runtime-aware SELECT.
func candidateRows(rows ...[3]any) *sqlmock.Rows {
r := sqlmock.NewRows([]string{"id", "runtime", "age_sec"})
for _, row := range rows {
r = r.AddRow(row[0], row[1], row[2])
}
return r
}
// TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
// provisioning workspace gets flipped to failed AND an event is broadcast.
func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
// claude-code workspace, 700s old > 600s default timeout → flipped.
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -69,6 +81,60 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
}
}
// TestSweepStuckProvisioning_HermesGets30MinSlack — the regression that
// motivated the runtime-aware change. A hermes workspace 11 min into
// cold-boot must NOT be flipped to failed; the watcher's 25-min budget
// covers it. Without the fix, the 10-min sweep killed healthy hermes
// boots mid-install (issue #2061's E2E failure on 2026-04-26).
func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
mock := setupTestDB(t)
// 11 min = 660 sec. < HermesProvisioningTimeout (1800s).
// No UPDATE should fire — hermes still has time.
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
emit := &fakeEmitter{}
sweepStuckProvisioning(context.Background(), emit)
if emit.count() != 0 {
t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet expectations: %v", err)
}
}
// TestSweepStuckProvisioning_HermesPastDeadline — a hermes workspace
// past 30 min DOES get flipped. Closes the loop on the runtime-aware
// fix: it's still bounded, just with a longer threshold than other
// runtimes.
func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
mock := setupTestDB(t)
// 31 min = 1860 sec > HermesProvisioningTimeout (1800s).
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
WillReturnRows(candidateRows([3]any{"ws-hermes-stuck", "hermes", 1860}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-hermes-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
emit := &fakeEmitter{}
sweepStuckProvisioning(context.Background(), emit)
if emit.count() != 1 {
t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
}
// Payload should include runtime so ops can distinguish in logs.
payload, ok := emit.events[0].Payload.(map[string]interface{})
if !ok {
t.Fatalf("payload not a map: %T", emit.events[0].Payload)
}
if payload["runtime"] != "hermes" {
t.Errorf("payload.runtime = %v, want hermes", payload["runtime"])
}
}
// TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
// 0 rows because the workspace flipped to online (or got restarted) between
// the SELECT and the UPDATE. We should skip the event, not emit a false
@ -76,8 +142,8 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-raced"))
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
WillReturnRows(candidateRows([3]any{"ws-raced", "claude-code", 700}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -99,8 +165,8 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}))
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
WillReturnRows(candidateRows())
emit := &fakeEmitter{}
sweepStuckProvisioning(context.Background(), emit)
@ -115,14 +181,16 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
// TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
// both agents (claude-code + hermes) are stuck — both should get flipped
// and both should get events.
// and both should get events. claude-code at 11 min (over its 10-min
// limit), hermes at 31 min (over its 30-min limit).
func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).
AddRow("ws-claude-code").
AddRow("ws-hermes"))
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
WillReturnRows(candidateRows(
[3]any{"ws-claude-code", "claude-code", 700},
[3]any{"ws-hermes", "hermes", 1860},
))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg()).
@ -145,8 +213,8 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT id FROM workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))
mock.ExpectExec(`UPDATE workspaces`).
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
@ -158,18 +226,47 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
// TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
// env var takes effect when set to a positive integer, and falls back to
// default otherwise.
// the per-runtime default otherwise.
func TestProvisioningTimeout_EnvOverride(t *testing.T) {
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
if got := provisioningTimeout(); got.Seconds() != 60 {
t.Errorf("override: got %v, want 60s", got)
// When env override is set it wins over runtime defaults.
if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
t.Errorf("override (no runtime): got %v, want 60s", got)
}
if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
t.Errorf("override (hermes): got %v, want 60s", got)
}
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
t.Errorf("default: got %v, want %v", got, DefaultProvisioningTimeout)
if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
}
t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
t.Errorf("bad override: got %v, want default %v", got, DefaultProvisioningTimeout)
if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
}
}
// TestProvisioningTimeout_RuntimeAware verifies hermes gets the longer
// HermesProvisioningTimeout while other runtimes keep the default.
// Mirrors bootstrap_watcher.go's bootstrapTimeoutFn — these two
// timeouts must stay in sync (sweep > watcher) or healthy hermes
// boots get killed mid-install.
func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
cases := []struct {
runtime string
want time.Duration
}{
{"hermes", HermesProvisioningTimeout},
{"langgraph", DefaultProvisioningTimeout},
{"claude-code", DefaultProvisioningTimeout},
{"crewai", DefaultProvisioningTimeout},
{"autogen", DefaultProvisioningTimeout},
{"", DefaultProvisioningTimeout},
{"unknown-runtime", DefaultProvisioningTimeout},
}
for _, c := range cases {
if got := provisioningTimeoutFor(c.runtime); got != c.want {
t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
}
}
}
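Because the sweep deadline has to stay ahead of the CP bootstrap-watcher's budget, a guard test in this file could pin that relationship. A sketch, assuming this file's existing imports and hard-coding the 25-minute watcher figure from the comments above (the canonical constant lives CP-side):

// TestHermesSweepStaysAheadOfWatcher — sketch of a guard: if someone lowers
// HermesProvisioningTimeout below the CP bootstrap-watcher's budget, fail
// loudly instead of reintroducing the mid-install kill. The 25m figure is a
// locally hard-coded assumption, not imported from the CP repo.
func TestHermesSweepStaysAheadOfWatcher(t *testing.T) {
	const cpWatcherBudget = 25 * time.Minute
	if HermesProvisioningTimeout <= cpWatcherBudget {
		t.Fatalf("HermesProvisioningTimeout (%v) must stay above the CP watcher budget (%v)",
			HermesProvisioningTimeout, cpWatcherBudget)
	}
}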

View File

@ -8,6 +8,7 @@ import (
"strings"
"sync"
"time"
"unicode/utf8"
"github.com/google/uuid"
cronlib "github.com/robfig/cron/v3"
@ -23,8 +24,26 @@ const (
fireTimeout = 5 * time.Minute
phantomSweepInterval = 5 * time.Minute
phantomStaleThreshold = 10 * time.Minute
// #2026: per-DB-op deadline. Every scheduler DB call must complete
// within this window or the Exec/Query is cancelled and the tick
// continues. Before this, a slow/stuck DB op (bad UTF-8 rejected by
// Postgres, connection pool exhausted, replica lag) would block a
// fireSchedule goroutine indefinitely, which blocked wg.Wait() in
// tick(), which stalled the entire scheduler until operator restart.
dbQueryTimeout = 10 * time.Second
)
// sanitizeUTF8 replaces invalid UTF-8 byte sequences with the Unicode
// replacement character. Used before writing agent-produced strings to
// Postgres (text/jsonb columns reject invalid UTF-8, silently failing the
// INSERT and holding the transaction open). #2026.
func sanitizeUTF8(s string) string {
if utf8.ValidString(s) {
return s
}
return strings.ToValidUTF8(s, "\uFFFD")
}
// A2AProxy is the interface the scheduler needs to send messages to workspaces.
// WorkspaceHandler.ProxyA2ARequest satisfies this.
type A2AProxy interface {
@ -186,7 +205,10 @@ func (s *Scheduler) Start(ctx context.Context) {
func (s *Scheduler) tick(ctx context.Context) {
supervised.Heartbeat("scheduler")
rows, err := db.DB.QueryContext(ctx, `
// #2026: bound the due-schedules query — if Postgres is slow/stuck
// this fails fast instead of blocking the tick loop indefinitely.
queryCtx, queryCancel := context.WithTimeout(ctx, dbQueryTimeout)
rows, err := db.DB.QueryContext(queryCtx, `
SELECT id, workspace_id, name, cron_expr, timezone, prompt
FROM workspace_schedules
WHERE enabled = true AND next_run_at IS NOT NULL AND next_run_at <= now()
@ -194,9 +216,11 @@ func (s *Scheduler) tick(ctx context.Context) {
LIMIT $1
`, batchLimit)
if err != nil {
queryCancel()
log.Printf("Scheduler: tick query error: %v", err)
return
}
defer queryCancel()
defer rows.Close()
var wg sync.WaitGroup
@ -276,20 +300,29 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
// to allow concurrent task processing (e.g. leaders handling A2A while cron runs).
var activeTasks int
var maxConcurrent int
if err := db.DB.QueryRowContext(ctx,
// #2026: bound the capacity check — if the DB is slow, fail open
// (skip the capacity wait, let fireTimeout catch a truly stuck fire)
// rather than blocking here indefinitely.
capCtx, capCancel := context.WithTimeout(ctx, dbQueryTimeout)
capErr := db.DB.QueryRowContext(capCtx,
`SELECT COALESCE(active_tasks, 0), COALESCE(max_concurrent_tasks, 1) FROM workspaces WHERE id = $1`,
sched.WorkspaceID,
).Scan(&activeTasks, &maxConcurrent); err == nil && activeTasks >= maxConcurrent {
).Scan(&activeTasks, &maxConcurrent)
capCancel()
if capErr == nil && activeTasks >= maxConcurrent {
log.Printf("Scheduler: '%s' workspace %s at capacity (active_tasks=%d, max=%d), deferring up to 2 min",
sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent)
// Poll every 10s for up to 2 minutes
waited := false
for i := 0; i < 12; i++ {
time.Sleep(10 * time.Second)
if err := db.DB.QueryRowContext(ctx,
pollCtx, pollCancel := context.WithTimeout(ctx, dbQueryTimeout)
err := db.DB.QueryRowContext(pollCtx,
`SELECT COALESCE(active_tasks, 0), COALESCE(max_concurrent_tasks, 1) FROM workspaces WHERE id = $1`,
sched.WorkspaceID,
).Scan(&activeTasks, &maxConcurrent); err != nil || activeTasks < maxConcurrent {
).Scan(&activeTasks, &maxConcurrent)
pollCancel()
if err != nil || activeTasks < maxConcurrent {
waited = true
break
}
@ -362,7 +395,12 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
// per schedule; at 100 tenants × dozens of schedules the saved
// query matters.
var consecEmpty int
if err := db.DB.QueryRowContext(ctx, `
// #2026: bound the empty-run UPDATE — survives outer ctx cancellation
// (uses Background()) so the bookkeeping completes even if fireTimeout
// cancelled the HTTP call, and has its own deadline so a stuck DB
// can't block the goroutine.
emptyCtx, emptyCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
if err := db.DB.QueryRowContext(emptyCtx, `
UPDATE workspace_schedules
SET consecutive_empty_runs = consecutive_empty_runs + 1,
updated_at = now()
@ -370,6 +408,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
RETURNING consecutive_empty_runs`, sched.ID).Scan(&consecEmpty); err != nil {
log.Printf("Scheduler: '%s' empty-run bump failed: %v", sched.Name, err)
}
emptyCancel()
if consecEmpty >= 3 {
lastStatus = "stale"
lastError = fmt.Sprintf("empty response %d consecutive times — agent may be phantom-producing (#795)", consecEmpty)
@ -378,11 +417,13 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
}
} else if lastStatus == "ok" {
// Non-empty success — reset the counter
db.DB.ExecContext(ctx, `
resetCtx, resetCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
_, _ = db.DB.ExecContext(resetCtx, `
UPDATE workspace_schedules
SET consecutive_empty_runs = 0,
updated_at = now()
WHERE id = $1`, sched.ID)
resetCancel()
}
nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now())
@ -422,20 +463,31 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
// Log a dedicated cron_run activity entry with schedule metadata so the
// history endpoint can query by schedule_id.
// #2026: sanitize the truncated prompt — even UTF-8-safe truncate() can
// carry pre-existing invalid bytes from an agent-edited template. jsonb
// columns reject invalid UTF-8 and hold the transaction open.
cronMeta, _ := json.Marshal(map[string]interface{}{
"schedule_id": sched.ID,
"schedule_name": sched.Name,
"cron_expr": sched.CronExpr,
"prompt": truncate(sched.Prompt, 200),
"prompt": sanitizeUTF8(truncate(sched.Prompt, 200)),
})
// #152: persist lastError into error_detail on the activity_logs row
// so GET /workspaces/:id/schedules/:id/history can surface why a run
// failed (previously dropped — history returned status without any
// error context, making root-cause debugging impossible).
_, _ = db.DB.ExecContext(ctx, `
// #2026: bounded Background() context — this INSERT was observed wedging
// indefinitely on invalid-UTF-8 jsonb payloads, blocking wg.Wait() in
// tick() and stalling the whole scheduler. Now: 10s deadline, survives
// outer ctx cancellation, and every string is UTF-8 sanitized.
insertCtx, insertCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
if _, insErr := db.DB.ExecContext(insertCtx, `
INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at)
VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, $4, $5, now())
`, sched.WorkspaceID, "Cron: "+sched.Name, string(cronMeta), lastStatus, lastError)
`, sched.WorkspaceID, sanitizeUTF8("Cron: "+sched.Name), string(cronMeta), lastStatus, sanitizeUTF8(lastError)); insErr != nil {
log.Printf("Scheduler: activity_logs insert failed for '%s' (%s): %v", sched.Name, sched.ID, insErr)
}
insertCancel()
if s.broadcaster != nil {
s.broadcaster.RecordAndBroadcast(ctx, "CRON_EXECUTED", sched.WorkspaceID, map[string]interface{}{
@ -483,7 +535,10 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
// Advance next_run_at + bump run_count so the liveness view reflects
// that we're still ticking. last_status='skipped', last_error carries
// the reason for operators debugging via the schedule history API.
_, _ = db.DB.ExecContext(ctx, `
// #2026: bounded Background() context so the bookkeeping can't block
// on a stuck DB and stall the scheduler.
skipUpdCtx, skipUpdCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
_, _ = db.DB.ExecContext(skipUpdCtx, `
UPDATE workspace_schedules
SET last_run_at = now(),
next_run_at = COALESCE($2, next_run_at),
@ -492,7 +547,8 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
last_error = $3,
updated_at = now()
WHERE id = $1
`, sched.ID, nextRunPtr, reason)
`, sched.ID, nextRunPtr, sanitizeUTF8(reason))
skipUpdCancel()
cronMeta, _ := json.Marshal(map[string]interface{}{
"schedule_id": sched.ID,
@ -501,10 +557,14 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
"skipped": true,
"active_tasks": activeTasks,
})
_, _ = db.DB.ExecContext(ctx, `
// #2026: bounded Background() context on the skipped activity log INSERT
// for the same reason as the fireSchedule activity_logs INSERT above.
skipInsCtx, skipInsCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
_, _ = db.DB.ExecContext(skipInsCtx, `
INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at)
VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, 'skipped', $4, now())
`, sched.WorkspaceID, "Cron skipped: "+sched.Name, string(cronMeta), reason)
`, sched.WorkspaceID, sanitizeUTF8("Cron skipped: "+sched.Name), string(cronMeta), sanitizeUTF8(reason))
skipInsCancel()
if s.broadcaster != nil {
_ = s.broadcaster.RecordAndBroadcast(ctx, "CRON_SKIPPED", sched.WorkspaceID, map[string]interface{}{
@ -690,11 +750,26 @@ func isEmptyResponse(body []byte) bool {
return false
}
// truncate shortens s to at most maxLen bytes, appending "..." if truncated.
// #2026: UTF-8 safe — byte-slicing at maxLen-3 would split multi-byte runes
// (observed: U+2026 `…` = 0xe2 0x80 0xa6, sliced mid-char, concatenated with
// "..." producing 0xe2 0x80 0x2e — rejected by Postgres as invalid UTF-8,
// which wedged the activity_logs INSERT with no deadline and stalled the
// scheduler).
func truncate(s string, maxLen int) string {
if len(s) <= maxLen {
return s
}
return s[:maxLen-3] + "..."
cut := maxLen - 3
if cut < 0 {
cut = 0
}
// Back up to a rune boundary — utf8.RuneStart returns true for any
// non-continuation byte (ASCII, or the lead byte of a multi-byte rune).
for cut > 0 && !utf8.RuneStart(s[cut]) {
cut--
}
return s[:cut] + "..."
}
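The bounded-Background pattern repeated through fireSchedule and recordSkipped could be collapsed into one helper. A sketch of the shape, assuming it lives in this file next to dbQueryTimeout (the diff above deliberately keeps the pattern inline at each call site):

// execBookkeeping runs a best-effort bookkeeping write with its own deadline,
// detached from the caller's context so fireTimeout cancellation can't drop
// it and a stuck DB can't block the goroutine past dbQueryTimeout.
func execBookkeeping(query string, args ...interface{}) {
	ctx, cancel := context.WithTimeout(context.Background(), dbQueryTimeout)
	defer cancel()
	if _, err := db.DB.ExecContext(ctx, query, args...); err != nil {
		log.Printf("Scheduler: bookkeeping write failed: %v", err)
	}
}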
// short returns up to n leading characters of s without panicking when s is

View File

@ -5,6 +5,7 @@ import (
"database/sql"
"testing"
"time"
"unicode/utf8"
sqlmock "github.com/DATA-DOG/go-sqlmock"
@ -599,3 +600,55 @@ func TestRecordSkipped_AdvancesNextRunAt(t *testing.T) {
}
}
// trigger CI
// ── TestTruncate_utf8Safe_regression2026 ──────────────────────────────────────
// TestTruncate_utf8Safe_regression2026 locks in the #2026 fix: truncate must
// never split a multi-byte UTF-8 rune. Before the fix, a prompt whose byte-197
// landed mid-rune (e.g. U+2026 `…` = 0xe2 0x80 0xa6) would be sliced at
// maxLen-3 and produce the sequence 0xe2 0x80 0x2e when concatenated with
// "...", which Postgres rejects as invalid UTF-8 — wedging the activity_logs
// INSERT and stalling the entire scheduler.
func TestTruncate_utf8Safe_regression2026(t *testing.T) {
// Build a prompt where the slice boundary at byte 197 (maxLen-3) falls
// inside the 3-byte rune U+2026 (`…`), which occupies bytes 195-197. The
// pre-fix s[:197] cut keeps only the rune's 0xe2 0x80 lead bytes and drops
// the 0xa6 tail, splitting the rune.
filler := ""
for len(filler) < 195 {
filler += "a"
}
input := filler + "…xxx" // 195 ASCII + 3-byte rune + 3 trailing
out := truncate(input, 200)
if !utf8.ValidString(out) {
t.Fatalf("truncate produced invalid UTF-8: %x", []byte(out))
}
// Must not contain the 0xe2 0x80 0x2e wedge sequence (partial rune
// followed by the "..." suffix).
for i := 0; i < len(out)-2; i++ {
if out[i] == 0xe2 && out[i+1] == 0x80 && out[i+2] == 0x2e {
t.Fatalf("truncate produced the 0xe2 0x80 0x2e wedge sequence at byte %d", i)
}
}
if len(out) > 200 {
t.Fatalf("truncate returned %d bytes, want <= 200", len(out))
}
}
// ── TestSanitizeUTF8 ──────────────────────────────────────────────────────────
// TestSanitizeUTF8 confirms sanitizeUTF8 leaves valid UTF-8 unchanged and
// replaces invalid sequences with the Unicode replacement character.
func TestSanitizeUTF8(t *testing.T) {
// Valid UTF-8 passes through unchanged.
valid := "hello … world"
if got := sanitizeUTF8(valid); got != valid {
t.Errorf("sanitizeUTF8(valid) = %q, want %q", got, valid)
}
// Invalid UTF-8 (orphan continuation byte) is sanitized.
bad := "hello \x80 world"
out := sanitizeUTF8(bad)
if !utf8.ValidString(out) {
t.Errorf("sanitizeUTF8 did not produce valid UTF-8: %x", []byte(out))
}
}

View File

@ -143,6 +143,21 @@ func (r *Registry) Names() []string {
return names
}
// Mutators returns a copy of the registered mutators in registration
// order. Used when multiple plugins build their own registries and need
// to merge onto a shared one at boot. Returns a copy so callers can't
// mutate internal state.
func (r *Registry) Mutators() []EnvMutator {
if r == nil {
return nil
}
r.mu.RLock()
defer r.mu.RUnlock()
out := make([]EnvMutator, len(r.mutators))
copy(out, r.mutators)
return out
}
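A boot-time merge in the shape Mutators is meant for might read as follows; a minimal sketch in which the shared registry's Register method name is an assumption, not confirmed by this diff:

// mergePluginRegistries copies every plugin-built mutator onto the shared
// registry, preserving registration order. Mutators() hands back a copy, so
// ranging over it stays safe even if a plugin later touches its own registry.
func mergePluginRegistries(shared *Registry, pluginRegs ...*Registry) {
	for _, pr := range pluginRegs {
		for _, m := range pr.Mutators() {
			shared.Register(m) // assumed method name — use the registry's real add/register call
		}
	}
}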
// FirstTokenProvider returns the first registered mutator that also
// implements TokenProvider, or nil if none do. Used to back the
// GET /admin/github-installation-token endpoint so long-running

View File

@ -247,8 +247,6 @@ class LangGraphA2AExecutor(AgentExecutor):
task_span.set_attribute(A2A_TASK_ID, context.context_id or "")
task_span.set_attribute("a2a.input_preview", user_input[:256])
await set_current_task(self._heartbeat, brief_task(user_input))
# Resolve IDs — the RequestContextBuilder always sets them, but
# we generate fallbacks for safety (e.g. in unit tests).
task_id = context.task_id or str(uuid.uuid4())
@ -257,6 +255,12 @@ class LangGraphA2AExecutor(AgentExecutor):
updater = TaskUpdater(event_queue, task_id, context_id)
try:
# set_current_task INSIDE the try so active_tasks is always
# decremented by the finally block even if CancelledError hits
# during the heartbeat HTTP push. Moving it outside the try
# created a window where cancellation left active_tasks stuck
# at 1, permanently blocking queue drain. (#2026)
await set_current_task(self._heartbeat, brief_task(user_input))
messages = _extract_history(context)
if messages:
logger.info("A2A execute: injecting %d history messages", len(messages))

View File

@ -426,14 +426,19 @@ class ClaudeSDKExecutor(AgentExecutor):
# Keep a clean copy of the user's actual message for the memory record,
# BEFORE any delegation or memory injection.
original_input = user_input
await set_current_task(self.heartbeat, brief_summary(user_input))
logger.debug("SDK execute [claude-code]: %s", user_input[:200])
prompt = self._prepare_prompt(user_input)
prompt = await self._inject_memories_if_first_turn(prompt)
response_text: str = ""
try:
# set_current_task INSIDE the try so active_tasks is always
# decremented by the finally block even if CancelledError hits
# during the heartbeat HTTP push. Moving it outside the try
# created a narrow window where cancellation left active_tasks
# stuck at 1 forever, permanently blocking queue drain. (#2026)
await set_current_task(self.heartbeat, brief_summary(user_input))
prompt = await self._inject_memories_if_first_turn(prompt)
for attempt in range(_MAX_RETRIES):
options = self._build_options()
try:

View File

@ -280,9 +280,6 @@ class CLIAgentExecutor(AgentExecutor):
# delegation or memory injection happens.
original_input = user_input
# Show current task on canvas — extract a brief one-line summary
await set_current_task(self._heartbeat, brief_summary(user_input))
logger.debug("CLI execute [%s]: %s", self.runtime, user_input[:200])
# Inject delegation results that arrived since last message
@ -290,13 +287,20 @@ class CLIAgentExecutor(AgentExecutor):
if delegation_context:
user_input = f"[Delegation results received while you were idle]\n{delegation_context}\n\n[New message]\n{user_input}"
# Auto-recall: inject prior memories into every prompt. (The CLI
# runtimes don't keep a session, so there's no "first turn" concept.)
memories = await recall_memories()
if memories:
user_input = f"[Prior context from memory]\n{memories}\n\n{user_input}"
try:
# set_current_task INSIDE the try so active_tasks is always
# decremented by the finally block even if CancelledError hits
# during the heartbeat HTTP push. Moving it outside the try
# created a window where cancellation left active_tasks stuck
# at 1, permanently blocking queue drain. (#2026)
await set_current_task(self._heartbeat, brief_summary(user_input))
# Auto-recall: inject prior memories into every prompt. (The CLI
# runtimes don't keep a session, so there's no "first turn" concept.)
memories = await recall_memories()
if memories:
user_input = f"[Prior context from memory]\n{memories}\n\n{user_input}"
await self._run_cli(user_input, event_queue)
finally:
await set_current_task(self._heartbeat, "")

View File

@ -166,23 +166,42 @@ class SecurityScanConfig:
class ComplianceConfig:
"""OWASP Top 10 for Agentic Applications compliance settings.
Set ``mode: owasp_agentic`` to enable all checks. When ``mode`` is
empty or absent the compliance layer is a complete no-op.
Default is ``mode: owasp_agentic`` + ``prompt_injection: detect``.
The detect mode logs injection attempts as audit events without
blocking the request so there is no false-positive UX cost, only
a gain in visibility. Operators opt into stricter ``block`` mode per
workspace. To disable compliance entirely (not recommended), set
``mode: ""`` in config.yaml.
Example config.yaml snippet::
Before 2026-04-24, the default was ``mode: ""`` (fully off). A
review of the A2A inbound path showed that no shipped template set
``mode`` explicitly, so prompt-injection detection was silently
disabled for every live workspace despite the machinery existing.
Flipping the default to ``owasp_agentic`` with ``prompt_injection:
detect`` closes that gap with zero user-visible behavior change.
Example config.yaml snippet to opt OUT::
compliance:
mode: owasp_agentic
prompt_injection: block # detect | block (default: detect)
mode: "" # disables all compliance checks
Example config.yaml snippet to tighten::
compliance:
mode: owasp_agentic # (default)
prompt_injection: block # (default: detect)
max_tool_calls_per_task: 30
max_task_duration_seconds: 180
"""
mode: str = ""
"""Enable compliance mode. Set to ``owasp_agentic`` to activate."""
mode: str = "owasp_agentic"
"""Enable compliance mode. ``owasp_agentic`` (default) activates the
OA-01/OA-02/OA-03/OA-06 checks; ``""`` disables everything."""
prompt_injection: str = "detect"
"""``detect`` logs injection attempts; ``block`` raises PromptInjectionError."""
"""``detect`` logs injection attempts (default, zero UX cost);
``block`` raises PromptInjectionError before the agent sees the
text. Operators can tighten to ``block`` per workspace."""
max_tool_calls_per_task: int = 50
"""Maximum number of tool invocations per task before ExcessiveAgencyError."""
@ -353,7 +372,9 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
fail_open_if_no_scanner=security_scan_raw.get("fail_open_if_no_scanner", True),
),
compliance=ComplianceConfig(
mode=compliance_raw.get("mode", ""),
# Default must match ComplianceConfig.mode's dataclass default
# (see class docstring for rationale — 2026-04-24 flip).
mode=compliance_raw.get("mode", "owasp_agentic"),
prompt_injection=compliance_raw.get("prompt_injection", "detect"),
max_tool_calls_per_task=int(compliance_raw.get("max_tool_calls_per_task", 50)),
max_task_duration_seconds=int(compliance_raw.get("max_task_duration_seconds", 300)),