Merge branch 'staging' into feat/external-runtime-first-class
commit 775406d7fe

.github/workflows/canary-staging.yml (vendored, 19 lines changed)
@@ -43,6 +43,17 @@ jobs:
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
      # Without an LLM key the test_staging_full_saas.sh script provisions
      # the workspace with empty secrets, hermes derive-provider.sh resolves
      # `openai/gpt-4o` to PROVIDER=openrouter, no OPENROUTER_API_KEY is
      # found in env, and A2A returns "No LLM provider configured" at
      # request time (canary step 8/11). The full-lifecycle workflow
      # (e2e-staging-saas.yml) has carried this secret since launch — the
      # canary regressed when it was first split out and lost the env
      # block. Issue #1500 had ~30 consecutive failures before this was
      # spotted; do NOT remove without re-reading the script's secrets-
      # injection block.
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
      E2E_MODE: canary
      E2E_RUNTIME: hermes
      E2E_RUN_ID: "canary-${{ github.run_id }}"
@@ -57,6 +68,14 @@ jobs:
            exit 2
          fi

      - name: Verify OpenAI key present
        run: |
          if [ -z "$E2E_OPENAI_API_KEY" ]; then
            echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'"
            exit 2
          fi
          echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})"

      - name: Canary run
        id: canary
        run: bash tests/e2e/test_staging_full_saas.sh

.github/workflows/redeploy-tenants-on-main.yml (vendored, new file, 164 lines)
@@ -0,0 +1,164 @@
name: redeploy-tenants-on-main

# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
#   1. publish-workspace-server-image completes → new :latest in GHCR.
#   2. This workflow fires via workflow_run, waits 30s for GHCR's
#      CDN to propagate the new tag to the region the tenants pull from.
#   3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s
#      soak. Canary proves the image boots; batches follow.
#   4. Any failure aborts the rollout and leaves older tenants on the
#      prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.
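
The CP-side implementation of redeploy-fleet is Go code in molecule-controlplane and is not part of this diff. The sketch below is an illustrative TypeScript rendering of the canary-first ordering described above; the helper names, types, and batching details are assumptions, not the real control-plane code.

```typescript
// Illustrative only — the real rollout lives in molecule-controlplane (Go).
// `deployTenant` is a hypothetical helper that deploys one tenant and
// reports whether its health gate passed.
interface RolloutOpts {
  targetTag: string;
  canarySlug?: string;   // empty/undefined = skip canary, fan out immediately
  soakSeconds: number;
  batchSize: number;
  tenants: string[];     // every live tenant slug
}

async function rollout(
  opts: RolloutOpts,
  deployTenant: (slug: string, tag: string) => Promise<boolean>,
): Promise<void> {
  const rest = opts.tenants.filter((t) => t !== opts.canarySlug);

  // 1. Canary first: prove the image boots before touching the fleet.
  if (opts.canarySlug) {
    if (!(await deployTenant(opts.canarySlug, opts.targetTag))) {
      throw new Error(`canary ${opts.canarySlug} failed — aborting rollout`);
    }
    // 2. Soak: give the canary time to crash-loop if it is going to.
    await new Promise((r) => setTimeout(r, opts.soakSeconds * 1000));
  }

  // 3. Fan out in health-gated batches; any failure halts the rollout,
  //    leaving the remaining tenants on the prior image.
  for (let i = 0; i < rest.length; i += opts.batchSize) {
    const batch = rest.slice(i, i + opts.batchSize);
    const results = await Promise.all(batch.map((t) => deployTenant(t, opts.targetTag)));
    if (results.some((ok) => !ok)) {
      throw new Error(`batch starting at ${batch[0]} failed — halting rollout`);
    }
  }
}
```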
on:
|
||||
workflow_run:
|
||||
workflows: ['publish-workspace-server-image']
|
||||
types: [completed]
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
target_tag:
|
||||
description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.'
|
||||
required: false
|
||||
type: string
|
||||
default: 'latest'
|
||||
canary_slug:
|
||||
description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
|
||||
required: false
|
||||
type: string
|
||||
default: 'hongmingwang'
|
||||
soak_seconds:
|
||||
description: 'Seconds to wait after canary before fanning out.'
|
||||
required: false
|
||||
type: string
|
||||
default: '60'
|
||||
batch_size:
|
||||
description: 'How many tenants SSM redeploys in parallel per batch.'
|
||||
required: false
|
||||
type: string
|
||||
default: '3'
|
||||
dry_run:
|
||||
description: 'Plan only — do not actually redeploy.'
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
# No write scopes needed — the workflow hits an external CP endpoint,
|
||||
# not the GitHub API.
|
||||
|
||||
jobs:
|
||||
redeploy:
|
||||
# Skip the auto-trigger if publish-workspace-server-image didn't
|
||||
# actually succeed. workflow_run fires on any completion state; we
|
||||
# don't want to redeploy against a half-built image.
|
||||
if: |
|
||||
github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 25
|
||||
steps:
|
||||
- name: Wait for GHCR tag propagation
|
||||
# GHCR's edge cache takes ~15-30s to consistently serve the new
|
||||
# :latest manifest after the registry accepts the push. Without
|
||||
# this sleep, the first tenant's docker pull sometimes races
|
||||
# and fetches the previous digest; sleeping is the cheapest
|
||||
# way to reduce that without polling GHCR for the new digest.
|
||||
run: sleep 30
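
The sleep is a pragmatic stand-in for the digest polling the comment mentions. If the propagation race ever matters, a poll against the registry API could replace it. A rough sketch follows; it assumes the package is publicly pullable (a private package would need a token minted from a PAT), and the image name and previous-digest plumbing are placeholders.

```typescript
// Sketch: poll GHCR until the tag's manifest digest changes from the one
// the previous deploy served. Registry details are assumptions.
async function waitForNewDigest(
  image: string,          // e.g. "molecule-ai/platform-tenant"
  tag: string,            // e.g. "latest"
  previousDigest: string,
  timeoutMs = 120_000,
): Promise<string> {
  const tokenRes = await fetch(
    `https://ghcr.io/token?service=ghcr.io&scope=repository:${image}:pull`,
  );
  const { token } = (await tokenRes.json()) as { token: string };

  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const res = await fetch(`https://ghcr.io/v2/${image}/manifests/${tag}`, {
      method: "HEAD",
      headers: {
        Authorization: `Bearer ${token}`,
        Accept:
          "application/vnd.oci.image.index.v1+json, application/vnd.docker.distribution.manifest.list.v2+json",
      },
    });
    const digest = res.headers.get("docker-content-digest");
    if (digest && digest !== previousDigest) return digest;
    await new Promise((r) => setTimeout(r, 5_000));
  }
  throw new Error(`GHCR still serving ${previousDigest} for ${image}:${tag} after ${timeoutMs}ms`);
}
```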
|
||||
|
||||
- name: Call CP redeploy-fleet
|
||||
# CP_ADMIN_API_TOKEN must be set as a repo/org secret on
|
||||
# Molecule-AI/molecule-core, matching the staging/prod CP's
|
||||
# CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
|
||||
# repo's secrets for CI.
|
||||
env:
|
||||
CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
|
||||
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
|
||||
TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
|
||||
CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }}
|
||||
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
|
||||
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
|
||||
DRY_RUN: ${{ inputs.dry_run || false }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
|
||||
echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
|
||||
echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
BODY=$(jq -nc \
|
||||
--arg tag "$TARGET_TAG" \
|
||||
--arg canary "$CANARY_SLUG" \
|
||||
--argjson soak "$SOAK_SECONDS" \
|
||||
--argjson batch "$BATCH_SIZE" \
|
||||
--argjson dry "$DRY_RUN" \
|
||||
'{
|
||||
target_tag: $tag,
|
||||
canary_slug: $canary,
|
||||
soak_seconds: $soak,
|
||||
batch_size: $batch,
|
||||
dry_run: $dry
|
||||
}')
|
||||
|
||||
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
|
||||
echo " body: $BODY"
|
||||
|
||||
HTTP_RESPONSE=$(mktemp)
|
||||
HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
|
||||
-m 1200 \
|
||||
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
|
||||
-d "$BODY" || echo "000")
|
||||
|
||||
echo "HTTP $HTTP_CODE"
|
||||
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
|
||||
|
||||
# Pretty-print per-tenant results in the job summary so
|
||||
# ops can see which tenants were redeployed without drilling
|
||||
# into the raw response.
|
||||
{
|
||||
echo "## Tenant redeploy fleet"
|
||||
echo ""
|
||||
echo "**Target tag:** \`$TARGET_TAG\`"
|
||||
echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
|
||||
echo "**Batch size:** $BATCH_SIZE"
|
||||
echo "**Dry run:** $DRY_RUN"
|
||||
echo "**HTTP:** $HTTP_CODE"
|
||||
echo ""
|
||||
echo "### Per-tenant result"
|
||||
echo ""
|
||||
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
|
||||
echo '|------|-------|------------|------|---------|-------|'
|
||||
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
if [ "$HTTP_CODE" != "200" ]; then
|
||||
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
|
||||
exit 1
|
||||
fi
|
||||
OK=$(jq -r '.ok' "$HTTP_RESPONSE")
|
||||
if [ "$OK" != "true" ]; then
|
||||
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
|
||||
exit 1
|
||||
fi
|
||||
echo "::notice::Tenant fleet redeploy complete."
|
||||
.github/workflows/retarget-main-to-staging.yml (vendored, 35 lines changed)
@@ -33,18 +33,49 @@ jobs:
        || github.event.pull_request.user.login == 'molecule-ai[bot]'
    steps:
      - name: Retarget PR base to staging
        id: retarget
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
        # Issue #1884: when the bot opens a PR against main and there's
        # already another PR on the same head branch targeting staging,
        # GitHub's PATCH /pulls returns 422 with
        # "A pull request already exists for base branch 'staging' …".
        # The retarget can't proceed — but the right response is to
        # close the now-redundant main-PR, not to fail the workflow
        # noisily. Detect that specific 422 and close instead.
        run: |
          set +e
          echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging"
          gh api -X PATCH \
          PATCH_OUTPUT=$(gh api -X PATCH \
            "repos/${{ github.repository }}/pulls/${PR_NUMBER}" \
            -f base=staging \
            --jq '.base.ref'
            --jq '.base.ref' 2>&1)
          PATCH_EXIT=$?
          set -e
          if [ "$PATCH_EXIT" -eq 0 ]; then
            echo "::notice::Retargeted PR #${PR_NUMBER} → staging"
            echo "outcome=retargeted" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Specifically match the 422 duplicate-base/head error so
          # any OTHER PATCH failure (auth, deleted PR, etc.) still
          # surfaces as a real workflow failure.
          if echo "$PATCH_OUTPUT" | grep -q "pull request already exists for base branch 'staging'"; then
            echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant."
            gh pr close "$PR_NUMBER" \
              --repo "${{ github.repository }}" \
              --comment "[retarget-bot] Closing — another PR on the same head branch already targets \`staging\`. This PR is redundant. See issue #1884 for the rationale."
            echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "::error::Retarget PATCH failed and was NOT a duplicate-base error:"
          echo "$PATCH_OUTPUT" >&2
          exit 1

      - name: Post explainer comment
        if: steps.retarget.outputs.outcome == 'retargeted'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}

.github/workflows/sweep-stale-e2e-orgs.yml (vendored, new file, 170 lines)
@@ -0,0 +1,170 @@
name: Sweep stale e2e-* orgs (staging)

# Janitor for staging tenants left behind when E2E cleanup didn't run:
# CI cancellations, runner crashes, transient AWS errors mid-cascade,
# bash trap missed (signal 9), etc. Without this loop, every failed
# teardown leaks an EC2 + DNS + DB row until manual ops cleanup —
# 2026-04-23 staging hit the 64 vCPU AWS quota from ~27 such orphans.
#
# Why not rely on per-test-run teardown:
#   - Per-run teardown is best-effort by definition. Any process death
#     after the test starts but before the trap fires leaves debris.
#   - GH Actions cancellation kills the runner without grace period.
#     The workflow's `if: always()` step usually catches this, but it
#     too can fail (CP transient 5xx, runner network issue at the
#     wrong moment).
#   - Even when teardown runs, the CP cascade is best-effort in places
#     (cascadeTerminateWorkspaces logs+continues; DNS deletion same).
#   - This sweep is the catch-all that converges staging back to clean
#     regardless of which specific path leaked.
#
# The PROPER fix is making CP cleanup transactional + verify-after-
# terminate (filed separately as cleanup-correctness work). This
# workflow is the safety net that catches everything else AND any
# future leak source we haven't yet identified.

on:
|
||||
schedule:
|
||||
# Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
|
||||
# clock from create to teardown). Anything older than the
|
||||
# MAX_AGE_MINUTES threshold below is presumed dead.
|
||||
- cron: '0 * * * *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
max_age_minutes:
|
||||
description: "Delete e2e-* orgs older than N minutes (default 120)"
|
||||
required: false
|
||||
default: "120"
|
||||
dry_run:
|
||||
description: "Dry run only — list what would be deleted"
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
# Don't let two sweeps fight. Cron + workflow_dispatch could overlap
|
||||
# on a manual trigger; queue rather than parallel-delete.
|
||||
concurrency:
|
||||
group: sweep-stale-e2e-orgs
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
sweep:
|
||||
name: Sweep e2e orgs
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
|
||||
MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
|
||||
DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
|
||||
# Refuse to delete more than this many orgs in one tick. If the
|
||||
# CP DB is briefly empty (or the admin endpoint goes weird and
|
||||
# returns no created_at), every e2e- org would look stale.
|
||||
# Bailing protects against runaway nukes.
|
||||
SAFETY_CAP: 50
|
||||
|
||||
steps:
|
||||
- name: Verify admin token present
|
||||
run: |
|
||||
if [ -z "$ADMIN_TOKEN" ]; then
|
||||
echo "::error::MOLECULE_STAGING_ADMIN_TOKEN not set"
|
||||
exit 2
|
||||
fi
|
||||
echo "Admin token present ✓"
|
||||
|
||||
- name: Identify stale e2e orgs
|
||||
id: identify
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Fetch into a file so the python step reads it via stdin —
|
||||
# cleaner than embedding $(curl ...) into a heredoc.
|
||||
curl -sS --fail-with-body --max-time 30 \
|
||||
"$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
> orgs.json
|
||||
|
||||
# Filter:
|
||||
# 1. slug starts with 'e2e-' (covers e2e-, e2e-canary-,
|
||||
# e2e-canvas-* — all variants the test scripts mint)
|
||||
# 2. created_at is older than MAX_AGE_MINUTES ago
|
||||
# Output one slug per line to a file the next step reads.
|
||||
python3 > stale_slugs.txt <<'PY'
|
||||
import json, os
|
||||
from datetime import datetime, timezone, timedelta
|
||||
with open("orgs.json") as f:
|
||||
data = json.load(f)
|
||||
max_age = int(os.environ["MAX_AGE_MINUTES"])
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
|
||||
for o in data.get("orgs", []):
|
||||
slug = o.get("slug", "")
|
||||
if not slug.startswith("e2e-"):
|
||||
continue
|
||||
created = o.get("created_at")
|
||||
if not created:
|
||||
# Defensively skip rows without created_at — better
|
||||
# to leave one orphan than nuke a brand-new row
|
||||
# whose timestamp didn't render.
|
||||
continue
|
||||
# Python 3.11+ handles RFC3339 with Z directly via
|
||||
# fromisoformat; older runners need the trailing Z swap.
|
||||
created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
|
||||
if created_dt < cutoff:
|
||||
print(slug)
|
||||
PY
|
||||
|
||||
count=$(wc -l < stale_slugs.txt | tr -d ' ')
|
||||
echo "Found $count stale e2e org(s) older than ${MAX_AGE_MINUTES}m"
|
||||
if [ "$count" -gt 0 ]; then
|
||||
echo "First 20:"
|
||||
head -20 stale_slugs.txt | sed 's/^/ /'
|
||||
fi
|
||||
echo "count=$count" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Safety gate
|
||||
if: steps.identify.outputs.count != '0'
|
||||
run: |
|
||||
count="${{ steps.identify.outputs.count }}"
|
||||
if [ "$count" -gt "$SAFETY_CAP" ]; then
|
||||
echo "::error::Refusing to delete $count orgs in one sweep (cap=$SAFETY_CAP). Investigate manually — this usually means the CP admin API returned no created_at or returned a degraded result. Re-run with workflow_dispatch + max_age_minutes if intentional."
|
||||
exit 1
|
||||
fi
|
||||
echo "Within safety cap ($count ≤ $SAFETY_CAP) ✓"
|
||||
|
||||
- name: Delete stale orgs
|
||||
if: steps.identify.outputs.count != '0' && env.DRY_RUN != 'true'
|
||||
run: |
|
||||
set -uo pipefail
|
||||
deleted=0
|
||||
failed=0
|
||||
while IFS= read -r slug; do
|
||||
[ -z "$slug" ] && continue
|
||||
# The DELETE handler requires {"confirm": "<slug>"} matching
|
||||
# the URL slug — fat-finger guard. Idempotent: re-issuing
|
||||
# picks up via org_purges.last_step.
|
||||
http_code=$(curl -sS -o /tmp/del_resp -w "%{http_code}" \
|
||||
--max-time 60 \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" || echo "000")
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
|
||||
deleted=$((deleted+1))
|
||||
echo " deleted: $slug"
|
||||
else
|
||||
failed=$((failed+1))
|
||||
echo " FAILED ($http_code): $slug — $(cat /tmp/del_resp 2>/dev/null | head -c 200)"
|
||||
fi
|
||||
done < stale_slugs.txt
|
||||
echo ""
|
||||
echo "Sweep summary: deleted=$deleted failed=$failed"
|
||||
# Don't fail the workflow on per-org delete errors — the
|
||||
# sweeper is best-effort. Next hourly tick re-attempts. We
|
||||
# only fail loud at the safety-cap gate above.
|
||||
|
||||
- name: Dry-run summary
|
||||
if: env.DRY_RUN == 'true'
|
||||
run: |
|
||||
echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s). Re-run with dry_run=false to actually delete."
|
||||
@@ -1,4 +1,4 @@
FROM node:20-alpine AS builder
FROM node:22-alpine AS builder
WORKDIR /app
COPY package.json package-lock.json* ./
RUN npm install
@@ -11,7 +11,7 @@ ENV NEXT_PUBLIC_WS_URL=$NEXT_PUBLIC_WS_URL
ENV NEXT_PUBLIC_ADMIN_TOKEN=$NEXT_PUBLIC_ADMIN_TOKEN
RUN npm run build

FROM node:20-alpine
FROM node:22-alpine
WORKDIR /app
COPY --from=builder /app/.next/standalone ./
COPY --from=builder /app/.next/static ./.next/static

@@ -5,7 +5,7 @@
 * the per-tenant admin token, provisions one hermes workspace, waits
 * for online, then exports:
 *
 *   STAGING_TENANT_URL      https://<slug>.moleculesai.app
 *   STAGING_TENANT_URL      https://<slug>.staging.moleculesai.app
 *   STAGING_WORKSPACE_ID    UUID of the hermes workspace
 *   STAGING_TENANT_TOKEN    per-tenant admin bearer (for spec requests)
 *   STAGING_SLUG            org slug (used by teardown)
@@ -16,6 +16,11 @@
 *                           CP_ADMIN_API_TOKEN). Drives provision +
 *                           tenant-token retrieval + teardown via a
 *                           single credential.
 *   STAGING_TENANT_DOMAIN   default: staging.moleculesai.app — the
 *                           DNS suffix the CP provisioner writes for
 *                           staging tenants. Override only when
 *                           running this harness against a non-default
 *                           zone.
 */

import type { FullConfig } from "@playwright/test";
@@ -25,6 +30,14 @@ import { join } from "path";
const CP_URL = process.env.MOLECULE_CP_URL || "https://staging-api.moleculesai.app";
const ADMIN_TOKEN = process.env.MOLECULE_ADMIN_TOKEN;
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
// Tenant DNS zone for staging. CP provisioner registers DNS as
// `<slug>.staging.moleculesai.app` (see internal/provisioner/ec2.go's
// EC2 provisioner: DNS log line). The previous default of plain
// `moleculesai.app` matched prod tenant naming and silently broke
// every staging E2E at the TLS readiness step — DNS literally didn't
// resolve, fetch threw NXDOMAIN, waitFor saw null on every poll, and
// the harness wedged at TLS_TIMEOUT_MS instead of failing loud.
const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.app";
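
A fail-loud guard for exactly that failure mode could look like the sketch below. It is illustrative only and not part of this diff; the helper name and where it would be called from are assumptions.

```typescript
import { lookup } from "node:dns/promises";

// Sketch: resolve the tenant hostname once before entering the long TLS
// readiness wait, so an NXDOMAIN (wrong TENANT_DOMAIN) fails in seconds
// instead of wedging for TLS_TIMEOUT_MS.
async function assertTenantDnsResolves(slug: string): Promise<void> {
  const host = `${slug}.${TENANT_DOMAIN}`;
  try {
    await lookup(host);
  } catch (err) {
    throw new Error(
      `[staging-setup] ${host} does not resolve — check STAGING_TENANT_DOMAIN ` +
        `against what the CP provisioner actually registered (${String(err)})`,
    );
  }
}
```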
|
||||
|
||||
// Tenant cold boot on staging regularly takes 12-15 min when the
|
||||
// workspace-server Docker image isn't already cached on the AMI. Raised
|
||||
@ -105,22 +118,44 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
}
|
||||
console.log(`[staging-setup] Org created: ${slug}`);
|
||||
|
||||
// 2. Wait for tenant running (admin-orgs list is the status source)
|
||||
// 2. Wait for tenant running (admin-orgs list is the status source).
|
||||
//
|
||||
// The CP /cp/admin/orgs endpoint returns each org with an
|
||||
// `instance_status` field (handlers/admin.go:adminOrgSummary,
|
||||
// sourced from `org_instances.status`). NOT `status` — there's no
|
||||
// top-level `status` on the row at all. A previous version of this
|
||||
// test polled `row.status`, which was always undefined, so this
|
||||
// waitFor never resolved truthy and the harness invariably timed
|
||||
// out at 1200s — masking real CP bugs (see #242 chain) AND
|
||||
// surviving real CP fixes alike.
|
||||
// Capture the org UUID alongside the running check — every request
|
||||
// we send to the tenant URL after this point needs an
|
||||
// X-Molecule-Org-Id header (see workspace-server middleware/tenant_guard.go).
|
||||
// Without it, TenantGuard returns 404 ("must not be inferable by
|
||||
// probing other orgs' machines"). The CP returns the id on the
|
||||
// admin-orgs row; capture it here while we're already polling.
|
||||
let orgID = "";
|
||||
await waitFor<boolean>(
|
||||
async () => {
|
||||
const r = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
|
||||
if (r.status !== 200) return null;
|
||||
const row = (r.body?.orgs || []).find((o: any) => o.slug === slug);
|
||||
if (!row) return null;
|
||||
if (row.status === "running") return true;
|
||||
if (row.status === "failed") throw new Error(`provision failed: ${slug}`);
|
||||
if (row.instance_status === "running") {
|
||||
orgID = row.id;
|
||||
return true;
|
||||
}
|
||||
if (row.instance_status === "failed") throw new Error(`provision failed: ${slug}`);
|
||||
return null;
|
||||
},
|
||||
PROVISION_TIMEOUT_MS,
|
||||
15_000,
|
||||
"tenant provision",
|
||||
);
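
The `jsonFetch` and `waitFor` helpers used here are defined elsewhere in this harness and are not part of the hunk. Minimal sketches consistent with how they are used above follow; the signatures and error wording are assumptions.

```typescript
// Sketches only — not the harness's actual helper implementations.
async function jsonFetch(
  url: string,
  init: RequestInit = {},
): Promise<{ status: number; body: any }> {
  const res = await fetch(url, init);
  let body: any = null;
  try {
    body = await res.json();
  } catch {
    // Non-JSON or empty responses simply leave body = null.
  }
  return { status: res.status, body };
}

// Polls `fn` every `intervalMs` until it returns a non-null value, which is
// then returned. A thrown error aborts immediately; exceeding `timeoutMs`
// throws with the label for context.
async function waitFor<T>(
  fn: () => Promise<T | null>,
  timeoutMs: number,
  intervalMs: number,
  label: string,
): Promise<T> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const value = await fn();
    if (value !== null) return value;
    await new Promise((r) => setTimeout(r, intervalMs));
  }
  throw new Error(`timed out after ${timeoutMs}ms waiting for ${label}`);
}
```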
|
||||
console.log(`[staging-setup] Tenant running`);
|
||||
if (!orgID) {
|
||||
throw new Error(`expected admin-orgs row to carry id, got empty for slug=${slug}`);
|
||||
}
|
||||
console.log(`[staging-setup] Tenant running (org_id=${orgID})`);
|
||||
|
||||
// 3. Fetch per-tenant admin token
|
||||
const tokRes = await jsonFetch(
|
||||
@ -133,7 +168,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
);
|
||||
}
|
||||
const tenantToken: string = tokRes.body.admin_token;
|
||||
const tenantURL = `https://${slug}.moleculesai.app`;
|
||||
const tenantURL = `https://${slug}.${TENANT_DOMAIN}`;
|
||||
console.log(`[staging-setup] Tenant URL: ${tenantURL}`);
|
||||
|
||||
// 4. TLS readiness
|
||||
@ -154,7 +189,17 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
);
|
||||
|
||||
// 5. Provision workspace
|
||||
const tenantAuth = { Authorization: `Bearer ${tenantToken}` };
|
||||
//
|
||||
// tenantAuth carries TWO headers, both required:
|
||||
// - Authorization: Bearer <admin-token> — wsAdmin middleware gate
|
||||
// - X-Molecule-Org-Id: <uuid> — TenantGuard cross-org gate
|
||||
// Missing the org-id header silently 404s every non-allowlisted
|
||||
// route, with no body and no security headers. The 404 is intentional
|
||||
// (existence-non-inference) which makes it look like a missing route.
|
||||
const tenantAuth = {
|
||||
"Authorization": `Bearer ${tenantToken}`,
|
||||
"X-Molecule-Org-Id": orgID,
|
||||
};
|
||||
const ws = await jsonFetch(`${tenantURL}/workspaces`, {
|
||||
method: "POST",
|
||||
headers: tenantAuth,
|
||||
|
||||
@ -63,6 +63,82 @@ test.describe("staging canvas tabs", () => {
|
||||
Authorization: `Bearer ${tenantToken}`,
|
||||
});
|
||||
|
||||
// canvas/src/components/AuthGate.tsx fetches /cp/auth/me on mount
|
||||
// and redirects to the login page on 401. The bearer header above
|
||||
// is for platform API calls — it does NOT satisfy /cp/auth/me,
|
||||
// which is cookie-based (WorkOS session). Without this mock, the
|
||||
// canvas page mounts AuthGate, sees 401 from /cp/auth/me, and
|
||||
// redirects away from the tenant URL before the React Flow root
|
||||
// ever renders. The [aria-label] selector wait then times out.
|
||||
//
|
||||
// Intercept /cp/auth/me + return a fake Session shape so AuthGate
|
||||
// resolves to "authenticated" and renders {children}. The session
|
||||
// contents are cosmetic — the canvas only inspects org_id/user_id
|
||||
// in a few places that don't fail when these are dummy values.
|
||||
await context.route("**/cp/auth/me", (route) =>
|
||||
route.fulfill({
|
||||
status: 200,
|
||||
contentType: "application/json",
|
||||
body: JSON.stringify({
|
||||
user_id: `e2e-test-user-${workspaceId}`,
|
||||
org_id: "e2e-test-org",
|
||||
email: "e2e@test.local",
|
||||
}),
|
||||
}),
|
||||
);
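
For context, AuthGate itself is not in this diff. Its mount-time check presumably looks something like the sketch below (the component shape and redirect target are assumptions); the fulfilled 200 above is what lets it resolve to "authenticated".

```tsx
import React from "react";

// Sketch of the gate this mock satisfies — not the actual AuthGate source.
function AuthGate({ children }: { children: React.ReactNode }) {
  const [ready, setReady] = React.useState(false);

  React.useEffect(() => {
    fetch("/cp/auth/me", { credentials: "include" }).then((res) => {
      if (res.status === 401) {
        // Unauthenticated: bounce to login before rendering anything.
        window.location.assign("/login");
        return;
      }
      setReady(true);
    });
  }, []);

  return ready ? <>{children}</> : null;
}
```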
|
||||
|
||||
// Universal 401 → empty-200 fallback (defense-in-depth).
|
||||
//
|
||||
// The original product bug was canvas/src/lib/api.ts:62-74 calling
|
||||
// `redirectToLogin` on EVERY 401 — a single workspace-scoped 401
|
||||
// (e.g. /workspaces/:id/peers, /plugins) yanked the user (and the
|
||||
// test) to AuthKit. That's now fixed at the source: api.ts probes
|
||||
// /cp/auth/me before redirecting, so a 401 from a non-auth path
|
||||
// with a live session throws a regular error instead.
|
||||
//
|
||||
// This route handler stays as a SAFETY NET, not the primary
|
||||
// defense:
|
||||
// 1. It silences resource-load console noise from the browser
|
||||
// (those messages omit the URL, so they're useless in diagnostics;
// the assertion-block filter would drop them anyway, but keeping
// 401s off the network entirely is cleaner).
|
||||
// 2. It guards against panels that DON'T have try/catch around
|
||||
// their api calls — an unhandled rejection would surface
|
||||
// as console.error → fail the assertion. Panels SHOULD
|
||||
// handle errors, but until they're all audited, this is
|
||||
// the test's belt to api.ts's braces.
|
||||
//
|
||||
// Pass-through real responses; swap 401s for 200 + empty body.
|
||||
// Skip /cp/auth/me (mocked above) and non-fetch resources
|
||||
// (HTML/JS/CSS bundles that should NOT be intercepted).
|
||||
await context.route("**", async (route, request) => {
|
||||
if (request.resourceType() !== "fetch") {
|
||||
return route.fallback();
|
||||
}
|
||||
// /cp/auth/me is mocked above with a fixed Session shape — let
|
||||
// that handler win without us round-tripping the network.
|
||||
if (request.url().includes("/cp/auth/me")) {
|
||||
return route.fallback();
|
||||
}
|
||||
let resp;
|
||||
try {
|
||||
resp = await route.fetch();
|
||||
} catch {
|
||||
return route.fallback();
|
||||
}
|
||||
if (resp.status() !== 401) {
|
||||
return route.fulfill({ response: resp });
|
||||
}
|
||||
const lastSeg =
|
||||
new URL(request.url()).pathname.split("/").filter(Boolean).pop() || "";
|
||||
const looksLikeList = !/^[0-9a-f-]{8,}$/.test(lastSeg);
|
||||
await route.fulfill({
|
||||
status: 200,
|
||||
contentType: "application/json",
|
||||
body: looksLikeList ? "[]" : "{}",
|
||||
});
|
||||
});
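
The api.ts behavior described above — probing /cp/auth/me before redirecting — is also not shown in this diff. A sketch of that guard, with the function name and redirect target assumed, might be:

```typescript
// Sketch of the api.ts 401 handling described above — not the real source.
async function handleUnauthorized(path: string): Promise<never> {
  // A workspace-scoped 401 (e.g. /workspaces/:id/peers) no longer implies a
  // dead session. Probe the auth endpoint: only a 401 *there* means the
  // session is actually gone and a redirect is warranted.
  const probe = await fetch("/cp/auth/me", { credentials: "include" });
  if (probe.status === 401) {
    window.location.assign("/login"); // redirectToLogin equivalent
  }
  throw new Error(`401 from ${path} with a live session`);
}
```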
|
||||
|
||||
const consoleErrors: string[] = [];
|
||||
page.on("console", (msg) => {
|
||||
if (msg.type() === "error") {
|
||||
@ -70,13 +146,38 @@ test.describe("staging canvas tabs", () => {
|
||||
}
|
||||
});
|
||||
|
||||
await page.goto(tenantURL, { waitUntil: "networkidle" });
|
||||
// Capture the URL of any failed network request so a "Failed to load
|
||||
// resource: 404" console message we filter out below leaves a
|
||||
// breadcrumb. Browser console messages for resource-load failures
|
||||
// omit the URL, so we'd otherwise be flying blind. Logged to the
|
||||
// test's stdout (visible in the workflow log under the failed step).
|
||||
page.on("requestfailed", (req) => {
|
||||
console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
|
||||
});
|
||||
page.on("response", (res) => {
|
||||
if (res.status() >= 400) {
|
||||
console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
|
||||
}
|
||||
});
|
||||
|
||||
// waitUntil="networkidle" is wrong here — the canvas keeps a
|
||||
// WebSocket open + polls /events and /workspaces every few
|
||||
// seconds, so the network is *never* idle for 500ms. page.goto
|
||||
// would hang until its 45s default timeout. "domcontentloaded"
|
||||
// returns as soon as the HTML is parsed; React hydration + the
|
||||
// selector wait below is what actually gates ready-for-interaction.
|
||||
await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
|
||||
|
||||
// Canvas hydration races WebSocket connect + /workspaces fetch.
|
||||
// Wait for the tablist element (appears after a workspace is
|
||||
// selected) or the hydration-error banner — whichever wins first.
|
||||
// Wait for the React Flow canvas wrapper (always present once
|
||||
// hydrated, even with zero workspaces) or the hydration-error
|
||||
// banner — whichever wins first. Previous version of this wait
|
||||
// used `[role="tablist"]`, but that selector only appears AFTER
|
||||
// a workspace node is clicked (which happens below at L100), so
|
||||
// the wait would always time out at 45s before any meaningful
|
||||
// failure surfaced.
|
||||
await page.waitForSelector(
|
||||
'[role="tablist"], [data-testid="hydration-error"]',
|
||||
'[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
|
||||
{ timeout: 45_000 },
|
||||
);
|
||||
|
||||
@ -106,6 +207,15 @@ test.describe("staging canvas tabs", () => {
|
||||
for (const tabId of TAB_IDS) {
|
||||
await test.step(`tab: ${tabId}`, async () => {
|
||||
const tabButton = page.locator(`#tab-${tabId}`);
|
||||
// The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
|
||||
// wrapper) — tabs after position ~3 are clipped behind the
|
||||
// right-edge fade gradient on smaller viewports. Playwright's
|
||||
// `toBeVisible()` returns false for clipped elements, so a
|
||||
// bare visibility check fails on `skills` and later tabs in
|
||||
// CI. scrollIntoViewIfNeeded brings the button into view
|
||||
// before the visibility check, mirroring what SidePanel's own
|
||||
// keyboard handler does on arrow-key navigation.
|
||||
await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
|
||||
await expect(
|
||||
tabButton,
|
||||
`tab-${tabId} button missing — TABS list may have drifted`,
|
||||
@ -134,14 +244,22 @@ test.describe("staging canvas tabs", () => {
|
||||
|
||||
// Aggregate console-error budget. Known-noisy sources whitelisted:
|
||||
// Sentry, Vercel analytics, WS reconnects (expected on SaaS
|
||||
// terminal), favicon 404 (cosmetic).
|
||||
// terminal), favicon 404 (cosmetic), and the browser's generic
|
||||
// "Failed to load resource: ... 404" message which never includes
|
||||
// the URL — uninformative on its own and impossible to filter
|
||||
// meaningfully without a URL. The page.on('requestfailed') +
|
||||
// page.on('response>=400') logging above captures the actual URLs
|
||||
// so a real bug still leaves a breadcrumb in the workflow log;
|
||||
// a real exception (panel crash, JS error) still surfaces as a typed
// error with a file path, which this filter does not exclude.
|
||||
const appErrors = consoleErrors.filter(
|
||||
(msg) =>
|
||||
!msg.includes("sentry") &&
|
||||
!msg.includes("vercel") &&
|
||||
!msg.includes("WebSocket") &&
|
||||
!msg.includes("favicon") &&
|
||||
!msg.includes("molecule-icon.png"), // another cosmetic 404
|
||||
!msg.includes("molecule-icon.png") && // cosmetic 404
|
||||
!msg.includes("Failed to load resource"),
|
||||
);
|
||||
expect(
|
||||
appErrors,
|
||||
|
||||
@ -61,6 +61,11 @@ export default function Home() {
|
||||
{hydrationError && (
|
||||
<div
|
||||
role="alert"
|
||||
// Stable testid so the staging E2E (canvas/e2e/staging-tabs.spec.ts)
|
||||
// can detect this banner without depending on the role="alert"
|
||||
// selector that's used by other transient toasts. Don't rename
|
||||
// without updating that spec.
|
||||
data-testid="hydration-error"
|
||||
className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-4 z-[9999]"
|
||||
>
|
||||
<p className="text-zinc-400 text-sm">{hydrationError}</p>
|
||||
|
||||
@ -14,7 +14,7 @@ import { PricingTable } from "@/components/PricingTable";
|
||||
export const metadata = {
|
||||
title: "Pricing — Molecule AI",
|
||||
description:
|
||||
"Free while you tinker, paid tiers for shipping production multi-agent organizations. Transparent usage-based overage pricing on Pro.",
|
||||
"Flat-rate team and org pricing — no per-seat fees. Free to start, $29/month for teams, $99/month for production orgs. Full runtime stack included on every paid tier.",
|
||||
};
|
||||
|
||||
export default function PricingPage() {
|
||||
@ -25,9 +25,12 @@ export default function PricingPage() {
|
||||
Pricing
|
||||
</h1>
|
||||
<p className="mx-auto mt-4 max-w-2xl text-lg text-zinc-300">
|
||||
Free while you tinker. Pay when you ship real agents to production.
|
||||
Every tier includes the full runtime stack — you upgrade for scale,
|
||||
support, and dedicated infrastructure.
|
||||
One flat price per org — not per seat. Every paid tier includes the
|
||||
full runtime stack. You upgrade for scale, support, and dedicated
|
||||
infrastructure.
|
||||
</p>
|
||||
<p className="mx-auto mt-2 max-w-xl text-sm text-zinc-400">
|
||||
5-person team? You pay $29/month — not $200. No seat math, ever.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
@ -53,7 +56,8 @@ export default function PricingPage() {
|
||||
.
|
||||
</p>
|
||||
<p className="mt-6 text-sm text-zinc-500">
|
||||
Prices shown in USD. Enterprise / self-hosted licensing available — contact us.
|
||||
Prices shown in USD. Flat-rate per org — no per-seat fees on any paid tier.
|
||||
Enterprise / self-hosted licensing available — contact us.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
|
||||
@ -6,10 +6,16 @@ import { api } from "@/lib/api";
|
||||
import { showToast } from "./Toaster";
|
||||
import { ConsoleModal } from "./ConsoleModal";
|
||||
|
||||
/** Base provisioning timeout in milliseconds (2 minutes). Used as the
|
||||
* floor; the effective threshold scales with the number of workspaces
|
||||
* concurrently provisioning (see effectiveTimeoutMs below). */
|
||||
export const DEFAULT_PROVISION_TIMEOUT_MS = 120_000;
|
||||
import {
|
||||
DEFAULT_RUNTIME_PROFILE,
|
||||
provisionTimeoutForRuntime,
|
||||
} from "@/lib/runtimeProfiles";
|
||||
|
||||
/** Re-export for backward compatibility with tests and other importers
|
||||
* that previously imported DEFAULT_PROVISION_TIMEOUT_MS from this file.
|
||||
* New code should read via getRuntimeProfile() from @/lib/runtimeProfiles. */
|
||||
export const DEFAULT_PROVISION_TIMEOUT_MS =
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs;
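
The `@/lib/runtimeProfiles` module itself is not part of this hunk. Based on how it is consumed here and in its tests later in this diff, a minimal sketch might look like the following; the concrete timeout numbers are assumptions taken from the comments (2-minute default, roughly 12-minute hermes budget), not confirmed values.

```typescript
// Sketch of @/lib/runtimeProfiles, inferred from its usage — not the module source.
export interface RuntimeProfile {
  provisionTimeoutMs: number;
}

// 2-minute floor for fast docker runtimes (assumed from the old constant).
export const DEFAULT_RUNTIME_PROFILE: RuntimeProfile = {
  provisionTimeoutMs: 120_000,
};

// Only runtimes that need to deviate from the default are listed.
export const RUNTIME_PROFILES: Record<string, Partial<RuntimeProfile>> = {
  // hermes cold boot legitimately runs 8-13 min; give it ~12 min.
  hermes: { provisionTimeoutMs: 720_000 },
};

export function getRuntimeProfile(runtime?: string): RuntimeProfile {
  return {
    ...DEFAULT_RUNTIME_PROFILE,
    ...(runtime ? RUNTIME_PROFILES[runtime] : undefined),
  };
}

// Resolution order: server-side per-workspace override → runtime profile → default.
export function provisionTimeoutForRuntime(
  runtime?: string,
  overrides?: Partial<RuntimeProfile>,
): number {
  return overrides?.provisionTimeoutMs ?? getRuntimeProfile(runtime).provisionTimeoutMs;
}
```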
|
||||
|
||||
/** The server provisions up to `PROVISION_CONCURRENCY` containers at
|
||||
* once and paces the rest in a queue (`workspaceCreatePacingMs` =
|
||||
@ -43,8 +49,12 @@ interface TimeoutEntry {
|
||||
* time per node.
|
||||
*/
|
||||
export function ProvisioningTimeout({
|
||||
timeoutMs = DEFAULT_PROVISION_TIMEOUT_MS,
|
||||
timeoutMs,
|
||||
}: {
|
||||
// If undefined (the default when mounted without a prop), each workspace's
|
||||
// threshold is resolved from its runtime via timeoutForRuntime().
|
||||
// Pass an explicit number to force a single threshold for every workspace
|
||||
// (used by tests that want deterministic behavior regardless of runtime).
|
||||
timeoutMs?: number;
|
||||
}) {
|
||||
const [timedOut, setTimedOut] = useState<TimeoutEntry[]>([]);
|
||||
@ -57,19 +67,28 @@ export function ProvisioningTimeout({
|
||||
const [dismissed, setDismissed] = useState<Set<string>>(new Set());
|
||||
|
||||
// Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
|
||||
// (filter+map creates new array reference on every store update)
|
||||
// (filter+map creates new array reference on every store update).
|
||||
// Runtime included so the timeout threshold can be resolved per-node
|
||||
// (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
|
||||
// runtimes — a single threshold would false-alarm on one or the other).
|
||||
// Separator: `|` between fields, `,` between nodes. Names may contain
|
||||
// anything the user typed; strip `|` and `,` so serialization round-trips.
|
||||
const provisioningNodes = useCanvasStore((s) => {
|
||||
const result = s.nodes
|
||||
.filter((n) => n.data.status === "provisioning")
|
||||
.map((n) => `${n.id}:${n.data.name}`);
|
||||
.map((n) => {
|
||||
const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
|
||||
const runtime = n.data.runtime ?? "";
|
||||
return `${n.id}|${safeName}|${runtime}`;
|
||||
});
|
||||
return result.join(",");
|
||||
});
|
||||
const parsedProvisioningNodes = useMemo(
|
||||
() =>
|
||||
provisioningNodes
|
||||
? provisioningNodes.split(",").map((entry) => {
|
||||
const [id, name] = entry.split(":");
|
||||
return { id, name };
|
||||
const [id, name, runtime] = entry.split("|");
|
||||
return { id, name, runtime };
|
||||
})
|
||||
: [],
|
||||
[provisioningNodes],
|
||||
@ -113,14 +132,21 @@ export function ProvisioningTimeout({
|
||||
const interval = setInterval(() => {
|
||||
const now = Date.now();
|
||||
const newTimedOut: TimeoutEntry[] = [];
|
||||
const effective = effectiveTimeoutMs(
|
||||
timeoutMs,
|
||||
parsedProvisioningNodes.length,
|
||||
);
|
||||
|
||||
// Per-node timeout: each workspace resolves its own base via
|
||||
// @/lib/runtimeProfiles (server-override → runtime profile →
|
||||
// default), then scales by concurrent-provisioning count. A
|
||||
// hermes workspace in a batch alongside two langgraph workspaces
|
||||
// gets hermes's 12-min base, not langgraph's 2-min base.
|
||||
for (const node of parsedProvisioningNodes) {
|
||||
const startedAt = tracking.get(node.id);
|
||||
if (startedAt && now - startedAt >= effective) {
|
||||
if (!startedAt) continue;
|
||||
const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
|
||||
const effective = effectiveTimeoutMs(
|
||||
base,
|
||||
parsedProvisioningNodes.length,
|
||||
);
|
||||
if (now - startedAt >= effective) {
|
||||
newTimedOut.push({
|
||||
workspaceId: node.id,
|
||||
workspaceName: node.name,
|
||||
|
||||
@ -322,31 +322,6 @@ function countDescendants(nodeId: string, allNodes: Node<WorkspaceNodeData>[], v
|
||||
* infinite recursion on circular parentId references and keeps the UI readable. */
|
||||
const MAX_NESTING_DEPTH = 3;
|
||||
|
||||
/** Subscribes to allNodes only when children exist — isolates re-renders from parent */
|
||||
function EmbeddedTeam({ members, depth, onSelect, onExtract }: {
|
||||
members: Node<WorkspaceNodeData>[];
|
||||
depth: number;
|
||||
onSelect: (id: string) => void;
|
||||
onExtract: (id: string) => void;
|
||||
}) {
|
||||
const allNodes = useCanvasStore((s) => s.nodes);
|
||||
// Use grid layout at depth 0 when there are multiple members (departments side-by-side)
|
||||
const useGrid = depth === 0 && members.length >= 2;
|
||||
return (
|
||||
<div className="mt-2 pt-2 border-t border-zinc-700/30">
|
||||
<div className="text-[10px] text-zinc-500 uppercase tracking-widest mb-1.5">Team Members</div>
|
||||
<div className={useGrid
|
||||
? "grid grid-cols-2 gap-1.5 lg:grid-cols-3"
|
||||
: "space-y-1.5"
|
||||
}>
|
||||
{members.map((child) => (
|
||||
<TeamMemberChip key={child.id} node={child} allNodes={allNodes} depth={depth} onSelect={onSelect} onExtract={onExtract} />
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Recursive mini-card — mirrors parent card layout at smaller scale */
|
||||
function TeamMemberChip({
|
||||
node,
|
||||
|
||||
@ -50,14 +50,14 @@ describe("PricingTable", () => {
|
||||
it("renders all three plans with their CTAs", () => {
|
||||
render(<PricingTable />);
|
||||
expect(screen.getByRole("heading", { name: "Free" })).toBeTruthy();
|
||||
expect(screen.getByRole("heading", { name: "Starter" })).toBeTruthy();
|
||||
expect(screen.getByRole("heading", { name: "Pro" })).toBeTruthy();
|
||||
expect(screen.getByRole("heading", { name: "Team" })).toBeTruthy();
|
||||
expect(screen.getByRole("heading", { name: "Growth" })).toBeTruthy();
|
||||
expect(screen.getByRole("button", { name: "Get started" })).toBeTruthy();
|
||||
expect(screen.getByRole("button", { name: "Upgrade to Starter" })).toBeTruthy();
|
||||
expect(screen.getByRole("button", { name: "Upgrade to Pro" })).toBeTruthy();
|
||||
expect(screen.getByRole("button", { name: "Upgrade to Team" })).toBeTruthy();
|
||||
expect(screen.getByRole("button", { name: "Upgrade to Growth" })).toBeTruthy();
|
||||
});
|
||||
|
||||
it("shows the 'Most popular' badge only on the starter card", () => {
|
||||
it("shows the 'Most popular' badge only on the Team card", () => {
|
||||
render(<PricingTable />);
|
||||
const badges = screen.getAllByText("Most popular");
|
||||
expect(badges.length).toBe(1);
|
||||
@ -74,7 +74,7 @@ describe("PricingTable", () => {
|
||||
it("Paid CTA + anonymous → bounces to signup (no checkout call)", async () => {
|
||||
mockedFetchSession.mockResolvedValue(null);
|
||||
render(<PricingTable />);
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
|
||||
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
|
||||
expect(mockedStartCheckout).not.toHaveBeenCalled();
|
||||
});
|
||||
@ -91,7 +91,7 @@ describe("PricingTable", () => {
|
||||
});
|
||||
|
||||
render(<PricingTable />);
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
|
||||
|
||||
await waitFor(() =>
|
||||
expect(mockedStartCheckout).toHaveBeenCalledWith("pro", "acme"),
|
||||
@ -111,7 +111,7 @@ describe("PricingTable", () => {
|
||||
mockedGetTenantSlug.mockReturnValue("");
|
||||
|
||||
render(<PricingTable />);
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
|
||||
|
||||
await waitFor(() => {
|
||||
const alert = screen.getByRole("alert");
|
||||
@ -129,7 +129,7 @@ describe("PricingTable", () => {
|
||||
mockedStartCheckout.mockRejectedValue(new Error("checkout: 500 boom"));
|
||||
|
||||
render(<PricingTable />);
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Pro" }));
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Growth" }));
|
||||
|
||||
await waitFor(() => {
|
||||
const alert = screen.getByRole("alert");
|
||||
@ -140,7 +140,7 @@ describe("PricingTable", () => {
|
||||
it("treats fetchSession network errors as anonymous (fail-closed to signup)", async () => {
|
||||
mockedFetchSession.mockRejectedValue(new Error("network down"));
|
||||
render(<PricingTable />);
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Starter" }));
|
||||
fireEvent.click(screen.getByRole("button", { name: "Upgrade to Team" }));
|
||||
await waitFor(() => expect(mockedRedirectToLogin).toHaveBeenCalledWith("sign-up"));
|
||||
expect(mockedStartCheckout).not.toHaveBeenCalled();
|
||||
});
|
||||
@ -155,7 +155,7 @@ describe("PricingTable", () => {
|
||||
mockedStartCheckout.mockReturnValue(new Promise(() => {}));
|
||||
|
||||
render(<PricingTable />);
|
||||
const button = screen.getByRole("button", { name: "Upgrade to Pro" });
|
||||
const button = screen.getByRole("button", { name: "Upgrade to Growth" });
|
||||
fireEvent.click(button);
|
||||
|
||||
await waitFor(() => {
|
||||
|
||||
@ -8,6 +8,12 @@ global.fetch = vi.fn(() =>
|
||||
import { useCanvasStore } from "../../store/canvas";
|
||||
import type { WorkspaceData } from "../../store/socket";
|
||||
import { DEFAULT_PROVISION_TIMEOUT_MS } from "../ProvisioningTimeout";
|
||||
import {
|
||||
DEFAULT_RUNTIME_PROFILE,
|
||||
RUNTIME_PROFILES,
|
||||
getRuntimeProfile,
|
||||
provisionTimeoutForRuntime,
|
||||
} from "@/lib/runtimeProfiles";
|
||||
|
||||
// Helper to build a WorkspaceData object
|
||||
function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
|
||||
@ -184,4 +190,102 @@ describe("ProvisioningTimeout", () => {
|
||||
.nodes.filter((n) => n.data.status === "provisioning");
|
||||
expect(stillProvisioning).toHaveLength(2);
|
||||
});
|
||||
|
||||
// ── Runtime-aware timeout regression tests (2026-04-24 outage) ────────────
|
||||
// Prior to this, a hermes workspace consistently false-alarmed at 2 min
|
||||
// into its 8-13 min cold boot, pushing users to retry something that
|
||||
// would have come online on its own. The runtime-aware override keeps
|
||||
// the 2-min floor for fast docker runtimes while giving hermes its
|
||||
// honest 12-min budget.
|
||||
|
||||
describe("runtime profile resolution (@/lib/runtimeProfiles)", () => {
|
||||
describe("provisionTimeoutForRuntime", () => {
|
||||
it("returns the default for unknown/missing runtimes", () => {
|
||||
expect(provisionTimeoutForRuntime(undefined)).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("some-future-runtime")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
});
|
||||
|
||||
it("returns default for known-fast runtimes (not in profile map)", () => {
|
||||
// If someone ever adds one of these to RUNTIME_PROFILES with a
|
||||
// slower value, this test catches the unintended regression.
|
||||
expect(provisionTimeoutForRuntime("claude-code")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("langgraph")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("crewai")).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
});
|
||||
|
||||
it("returns hermes override when runtime = hermes", () => {
|
||||
expect(provisionTimeoutForRuntime("hermes")).toBe(
|
||||
RUNTIME_PROFILES.hermes?.provisionTimeoutMs,
|
||||
);
|
||||
expect(provisionTimeoutForRuntime("hermes")).toBeGreaterThanOrEqual(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs * 5,
|
||||
);
|
||||
});
|
||||
|
||||
it("server-side workspace override wins over runtime profile", () => {
|
||||
// The resolution order is: overrides → profile → default.
|
||||
// An operator-tunable per-workspace number on the backend
|
||||
// (e.g. via a template manifest field) should beat the canvas
|
||||
// runtime map.
|
||||
expect(
|
||||
provisionTimeoutForRuntime("hermes", {
|
||||
provisionTimeoutMs: 60_000,
|
||||
}),
|
||||
).toBe(60_000);
|
||||
expect(
|
||||
provisionTimeoutForRuntime("some-unknown", {
|
||||
provisionTimeoutMs: 300_000,
|
||||
}),
|
||||
).toBe(300_000);
|
||||
});
|
||||
});
|
||||
|
||||
describe("getRuntimeProfile", () => {
|
||||
it("returns a structural profile with required fields", () => {
|
||||
const profile = getRuntimeProfile("hermes");
|
||||
expect(profile.provisionTimeoutMs).toBeTypeOf("number");
|
||||
expect(profile.provisionTimeoutMs).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it("default profile is a valid superset of every override", () => {
|
||||
// Every entry in RUNTIME_PROFILES must provide fields the
|
||||
// default does — otherwise consumers could get undefined where
|
||||
// they expected a number. This test enforces that contract so
|
||||
// future entries can't accidentally drop fields.
|
||||
for (const [runtime, profile] of Object.entries(RUNTIME_PROFILES)) {
|
||||
const resolved = getRuntimeProfile(runtime);
|
||||
expect(
|
||||
resolved.provisionTimeoutMs,
|
||||
`runtime=${runtime} must resolve to a number`,
|
||||
).toBeTypeOf("number");
|
||||
expect(resolved.provisionTimeoutMs).toBeGreaterThan(0);
|
||||
// Profile's explicit value should be used iff present.
|
||||
if (profile.provisionTimeoutMs !== undefined) {
|
||||
expect(resolved.provisionTimeoutMs).toBe(profile.provisionTimeoutMs);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("DEFAULT_PROVISION_TIMEOUT_MS backward-compat export", () => {
|
||||
it("still exports the same default for legacy importers", () => {
|
||||
expect(DEFAULT_PROVISION_TIMEOUT_MS).toBe(
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@ -183,7 +183,31 @@ describe("ChannelsTab — htmlFor/id label associations (WCAG 1.3.1)", () => {
|
||||
beforeEach(() => {
|
||||
mockApiGet.mockImplementation((url: string) => {
|
||||
if (url.includes("/channels/adapters")) {
|
||||
return Promise.resolve([{ type: "telegram", display_name: "Telegram" }]);
|
||||
// Mirror the real GET /channels/adapters shape — schema-driven form
|
||||
// relies on config_schema arriving from the adapter. A bare
|
||||
// {type, display_name} mock renders an empty form and every
|
||||
// getByLabelText below fails.
|
||||
return Promise.resolve([
|
||||
{
|
||||
type: "telegram",
|
||||
display_name: "Telegram",
|
||||
config_schema: [
|
||||
{
|
||||
key: "bot_token",
|
||||
label: "Bot Token",
|
||||
type: "password",
|
||||
required: true,
|
||||
sensitive: true,
|
||||
},
|
||||
{
|
||||
key: "chat_id",
|
||||
label: "Chat IDs",
|
||||
type: "text",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
]);
|
||||
}
|
||||
return Promise.resolve([]);
|
||||
});
|
||||
|
||||
@ -31,12 +31,12 @@ export function UnsavedChangesGuard({
|
||||
</AlertDialog.Title>
|
||||
<div className="guard-dialog__actions">
|
||||
<AlertDialog.Cancel asChild>
|
||||
<button className="guard-dialog__keep-btn" onClick={onKeepEditing}>
|
||||
<button type="button" className="guard-dialog__keep-btn">
|
||||
Keep editing
|
||||
</button>
|
||||
</AlertDialog.Cancel>
|
||||
<AlertDialog.Action asChild>
|
||||
<button className="guard-dialog__discard-btn" onClick={onDiscard}>
|
||||
<button type="button" className="guard-dialog__discard-btn">
|
||||
Discard
|
||||
</button>
|
||||
</AlertDialog.Action>
|
||||
|
||||
@ -186,7 +186,7 @@ function ActivityRow({
|
||||
: "bg-zinc-800/60 border-zinc-700/40"
|
||||
}`}
|
||||
>
|
||||
<button onClick={onToggle} className="w-full text-left px-3 py-2">
|
||||
<button type="button" onClick={onToggle} className="w-full text-left px-3 py-2">
|
||||
{/* Top row: type badge + method + time */}
|
||||
<div className="flex items-center gap-2">
|
||||
<span className={`text-[8px] font-mono px-1.5 py-0.5 rounded ${typeStyle.text} ${typeStyle.bg} border ${typeStyle.border}`}>
|
||||
|
||||
@ -4,9 +4,23 @@ import { useState, useEffect, useCallback, useId } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import { ConfirmDialog } from "@/components/ConfirmDialog";
|
||||
|
||||
// ConfigField mirrors the Go struct returned by GET /channels/adapters —
|
||||
// the UI renders one input per field in the order the adapter returns
|
||||
// them, so per-platform form shape stays server-owned.
|
||||
interface ConfigField {
|
||||
key: string;
|
||||
label: string;
|
||||
type: "text" | "password" | "textarea";
|
||||
required: boolean;
|
||||
sensitive?: boolean;
|
||||
placeholder?: string;
|
||||
help?: string;
|
||||
}
|
||||
|
||||
interface ChannelAdapter {
|
||||
type: string;
|
||||
display_name: string;
|
||||
config_schema?: ConfigField[];
|
||||
}
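
The schema-driven form body that consumes `config_schema` falls below the cut of this diff. Rendering follows directly from these types; the sketch below shows the usual shape of such a loop, with the JSX structure, ids, and markup being illustrative assumptions rather than the component's actual code (textarea fields are simplified to text inputs here).

```tsx
// Sketch of a schema-driven field loop — illustrative, not ChannelsTab's real markup.
// Assumes React in scope and the ConfigField interface above.
function SchemaFields({
  schema,
  values,
  onChange,
}: {
  schema: ConfigField[];
  values: Record<string, string>;
  onChange: (key: string, value: string) => void;
}) {
  return (
    <>
      {schema.map((field) => (
        <div key={field.key}>
          {/* htmlFor/id pairing keeps the WCAG 1.3.1 association the tests check */}
          <label htmlFor={`cfg-${field.key}`}>
            {field.label}
            {field.required && " *"}
          </label>
          <input
            id={`cfg-${field.key}`}
            type={field.type === "password" ? "password" : "text"}
            placeholder={field.placeholder}
            value={values[field.key] || ""}
            onChange={(e) => onChange(field.key, e.target.value)}
          />
          {field.help && <p>{field.help}</p>}
        </div>
      ))}
    </>
  );
}
```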
|
||||
|
||||
interface Channel {
|
||||
@ -25,6 +39,11 @@ interface Props {
|
||||
workspaceId: string;
|
||||
}
|
||||
|
||||
// Telegram is the only platform that supports "Detect Chats" via
|
||||
// getUpdates. Every other platform uses a webhook URL that already
|
||||
// encodes the chat, so the button is only offered when useful.
|
||||
const SUPPORTS_DETECT_CHATS = new Set(["telegram"]);
|
||||
|
||||
function relativeTime(iso: string | null | undefined): string {
|
||||
if (!iso) return "never";
|
||||
const diff = Date.now() - new Date(iso).getTime();
|
||||
@ -41,11 +60,12 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
const [showForm, setShowForm] = useState(false);
|
||||
const [testing, setTesting] = useState<string | null>(null);
|
||||
const [pendingDelete, setPendingDelete] = useState<Channel | null>(null);
|
||||
const [error, setError] = useState("");
|
||||
|
||||
// Form state
|
||||
// Form state — schema-driven: formValues holds the typed-in config for
|
||||
// whichever adapter is currently selected, keyed by ConfigField.key.
|
||||
const [formType, setFormType] = useState("telegram");
|
||||
const [formBotToken, setFormBotToken] = useState("");
|
||||
const [formChatId, setFormChatId] = useState("");
|
||||
const [formValues, setFormValues] = useState<Record<string, string>>({});
|
||||
const [formAllowedUsers, setFormAllowedUsers] = useState("");
|
||||
const [formError, setFormError] = useState("");
|
||||
const [discovering, setDiscovering] = useState(false);
|
||||
@ -53,18 +73,13 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
const [selectedChats, setSelectedChats] = useState<Set<string>>(new Set());
|
||||
const [showManualInput, setShowManualInput] = useState(false);
|
||||
|
||||
// Stable IDs for label↔input associations (WCAG 1.3.1)
|
||||
const platformId = useId();
|
||||
const botTokenId = useId();
|
||||
const chatIdId = useId();
|
||||
const allowedUsersId = useId();
|
||||
|
||||
const currentAdapter = adapters.find((a) => a.type === formType);
|
||||
const currentSchema: ConfigField[] = currentAdapter?.config_schema || [];
|
||||
|
||||
const load = useCallback(async () => {
|
||||
// Fetch channels and adapters independently so a failure in one
|
||||
// doesn't blank the other. Previously a single Promise.all + silent
|
||||
// catch meant ANY request failing left both `channels` and
|
||||
// `adapters` empty — the user saw a "+ Connect" button with no
|
||||
// platform options, with no clue why.
|
||||
const [chResult, adResult] = await Promise.allSettled([
|
||||
api.get<Channel[]>(`/workspaces/${workspaceId}/channels`),
|
||||
api.get<ChannelAdapter[]>(`/channels/adapters`),
|
||||
@ -82,8 +97,6 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
console.warn("ChannelsTab: adapters load failed", adResult.reason);
|
||||
errors.push("platforms");
|
||||
}
|
||||
// Surface BOTH failure modes so the user can distinguish
|
||||
// "no channels configured" from "API unreachable".
|
||||
if (errors.length > 0) {
|
||||
setError(`Failed to load ${errors.join(" and ")} — try refreshing`);
|
||||
} else {
|
||||
@ -100,8 +113,24 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
return () => clearInterval(interval);
|
||||
}, [load]);
|
||||
|
||||
// Reset form values when the selected platform changes — each platform
|
||||
// has a different field set, so reusing old values would leak stale
|
||||
// data across platforms.
|
||||
useEffect(() => {
|
||||
setFormValues({});
|
||||
setDiscoveredChats([]);
|
||||
setSelectedChats(new Set());
|
||||
setShowManualInput(false);
|
||||
setFormError("");
|
||||
}, [formType]);
|
||||
|
||||
const setFieldValue = (key: string, value: string) => {
|
||||
setFormValues((prev) => ({ ...prev, [key]: value }));
|
||||
};
|
||||
|
||||
const handleDiscover = async () => {
|
||||
if (!formBotToken) {
|
||||
const botToken = formValues["bot_token"] || "";
|
||||
if (!botToken) {
|
||||
setFormError("Enter a bot token first");
|
||||
return;
|
||||
}
|
||||
@ -111,16 +140,15 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
try {
|
||||
const res = await api.post<{ chats: { chat_id: string; name: string; type: string }[]; hint: string }>(
|
||||
`/channels/discover`,
|
||||
{ channel_type: formType, bot_token: formBotToken, workspace_id: workspaceId }
|
||||
{ channel_type: formType, bot_token: botToken, workspace_id: workspaceId }
|
||||
);
|
||||
const chats = res.chats || [];
|
||||
setDiscoveredChats(chats);
|
||||
if (chats.length === 0) {
|
||||
setFormError("No chats found. For groups: add the bot and send a message. For DMs: send /start to the bot first. Then retry.");
|
||||
} else {
|
||||
// Auto-select all discovered chats
|
||||
setSelectedChats(new Set(chats.map((c) => c.chat_id)));
|
||||
setFormChatId(chats.map((c) => c.chat_id).join(", "));
|
||||
setFieldValue("chat_id", chats.map((c) => c.chat_id).join(", "));
|
||||
}
|
||||
} catch (e) {
|
||||
setFormError(String(e));
|
||||
@ -134,15 +162,22 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
const next = new Set(prev);
|
||||
if (next.has(chatId)) next.delete(chatId);
|
||||
else next.add(chatId);
|
||||
setFormChatId(Array.from(next).join(", "));
|
||||
setFieldValue("chat_id", Array.from(next).join(", "));
|
||||
return next;
|
||||
});
|
||||
};
|
||||
|
||||
const handleCreate = async () => {
|
||||
setFormError("");
|
||||
if (!formBotToken || !formChatId) {
|
||||
setFormError("Bot token and chat ID are required");
|
||||
// Client-side required-field check so the user sees the gap before
|
||||
// we round-trip to the server. ValidateConfig on the backend remains
|
||||
// authoritative — adapter-specific rules like "bot_token OR webhook_url"
|
||||
// for Slack aren't expressible in required-flag alone.
|
||||
const missing = currentSchema
|
||||
.filter((f) => f.required && !(formValues[f.key] || "").trim())
|
||||
.map((f) => f.label);
|
||||
if (missing.length > 0) {
|
||||
setFormError(`Required: ${missing.join(", ")}`);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
@ -150,14 +185,20 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
.split(",")
|
||||
.map((s) => s.trim())
|
||||
.filter(Boolean);
|
||||
// Only send keys the schema knows about — avoids accidentally
|
||||
// persisting stale values when the user switched platforms mid-edit.
|
||||
const config: Record<string, string> = {};
|
||||
for (const f of currentSchema) {
|
||||
const v = (formValues[f.key] || "").trim();
|
||||
if (v) config[f.key] = v;
|
||||
}
|
||||
await api.post(`/workspaces/${workspaceId}/channels`, {
|
||||
channel_type: formType,
|
||||
config: { bot_token: formBotToken, chat_id: formChatId },
|
||||
config,
|
||||
allowed_users: allowed,
|
||||
});
|
||||
setShowForm(false);
|
||||
setFormBotToken("");
|
||||
setFormChatId("");
|
||||
setFormValues({});
|
||||
setFormAllowedUsers("");
|
||||
load();
|
||||
} catch (e) {
|
||||
@ -165,8 +206,6 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
}
|
||||
};
|
||||
|
||||
const [error, setError] = useState("");
|
||||
|
||||
const handleToggle = async (ch: Channel) => {
|
||||
try {
|
||||
await api.patch(`/workspaces/${workspaceId}/channels/${ch.id}`, {
|
||||
@ -228,7 +267,7 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Create form */}
|
||||
{/* Create form — schema-driven */}
|
||||
{showForm && (
|
||||
<div className="space-y-2 p-3 bg-zinc-800/40 rounded border border-zinc-700/50">
|
||||
<div>
|
||||
@ -244,73 +283,69 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
))}
|
||||
</select>
|
||||
</div>
|
||||
<div>
|
||||
<label htmlFor={botTokenId} className="text-[10px] text-zinc-500 block mb-1">Bot Token</label>
|
||||
<input
|
||||
id={botTokenId}
|
||||
type="password"
|
||||
value={formBotToken}
|
||||
onChange={(e) => setFormBotToken(e.target.value)}
|
||||
placeholder="123456:ABC-DEF..."
|
||||
className="w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600"
|
||||
/>
|
||||
</div>
|
||||
<div>
|
||||
<div className="flex items-center justify-between mb-1">
|
||||
<label htmlFor={chatIdId} className="text-[10px] text-zinc-500">Chat IDs</label>
|
||||
<button
|
||||
onClick={handleDiscover}
|
||||
disabled={discovering || !formBotToken}
|
||||
className="text-[10px] px-2 py-0.5 rounded bg-blue-600/20 text-blue-400 hover:bg-blue-600/30 transition disabled:opacity-40"
|
||||
>
|
||||
{discovering ? "Detecting..." : "Detect Chats"}
|
||||
</button>
|
||||
|
||||
{/* Render one input per schema field. Fallback path: if the
|
||||
backend didn't return a schema (older platform version) show
|
||||
a single bot_token + chat_id pair to preserve the old UX. */}
|
||||
{currentSchema.length === 0 ? (
|
||||
<div className="text-[10px] text-yellow-500">
|
||||
Platform exposes no config schema — upgrade the platform to pick up first-class support.
|
||||
</div>
|
||||
{discoveredChats.length > 0 && (
|
||||
<div className="space-y-1 mb-2">
|
||||
{discoveredChats.map((chat) => (
|
||||
<label
|
||||
key={chat.chat_id}
|
||||
className="flex items-center gap-2 px-2 py-1.5 bg-zinc-900/50 rounded border border-zinc-700/50 cursor-pointer hover:bg-zinc-800/50"
|
||||
>
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={selectedChats.has(chat.chat_id)}
|
||||
onChange={() => toggleChat(chat.chat_id)}
|
||||
className="rounded border-zinc-600"
|
||||
/>
|
||||
<span className="text-xs text-zinc-300">{chat.name || "Unknown"}</span>
|
||||
<span className="text-[10px] text-zinc-500 ml-auto">{chat.type} {chat.chat_id}</span>
|
||||
</label>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
{(discoveredChats.length === 0 || showManualInput) && (
|
||||
<input
|
||||
id={chatIdId}
|
||||
value={formChatId}
|
||||
onChange={(e) => setFormChatId(e.target.value)}
|
||||
placeholder="-100123456789, -100987654321"
|
||||
className="w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600"
|
||||
) : (
|
||||
currentSchema.map((field) => (
|
||||
<SchemaField
|
||||
key={field.key}
|
||||
field={field}
|
||||
value={formValues[field.key] || ""}
|
||||
onChange={(v) => setFieldValue(field.key, v)}
|
||||
// Detect Chats button lives next to the chat_id input on
|
||||
// Telegram only (the only platform with getUpdates).
|
||||
renderExtras={
|
||||
field.key === "chat_id" && SUPPORTS_DETECT_CHATS.has(formType)
|
||||
? () => (
|
||||
<>
|
||||
<div className="flex items-center justify-end mb-1 -mt-1">
|
||||
<button
|
||||
onClick={handleDiscover}
|
||||
disabled={discovering || !formValues["bot_token"]}
|
||||
className="text-[10px] px-2 py-0.5 rounded bg-blue-600/20 text-blue-400 hover:bg-blue-600/30 transition disabled:opacity-40"
|
||||
>
|
||||
{discovering ? "Detecting..." : "Detect Chats"}
|
||||
</button>
|
||||
</div>
|
||||
{discoveredChats.length > 0 && (
|
||||
<div className="space-y-1 mb-2">
|
||||
{discoveredChats.map((chat) => (
|
||||
<label
|
||||
key={chat.chat_id}
|
||||
className="flex items-center gap-2 px-2 py-1.5 bg-zinc-900/50 rounded border border-zinc-700/50 cursor-pointer hover:bg-zinc-800/50"
|
||||
>
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={selectedChats.has(chat.chat_id)}
|
||||
onChange={() => toggleChat(chat.chat_id)}
|
||||
className="rounded border-zinc-600"
|
||||
/>
|
||||
<span className="text-xs text-zinc-300">{chat.name || "Unknown"}</span>
|
||||
<span className="text-[10px] text-zinc-500 ml-auto">{chat.type} {chat.chat_id}</span>
|
||||
</label>
|
||||
))}
|
||||
<button
|
||||
onClick={() => setShowManualInput(!showManualInput)}
|
||||
className="text-[10px] text-blue-400 hover:underline"
|
||||
>
|
||||
{showManualInput ? "hide manual input" : "edit manually"}
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)
|
||||
: undefined
|
||||
}
|
||||
/>
|
||||
)}
|
||||
<p className="text-[11px] text-zinc-500 mt-0.5">
|
||||
{discoveredChats.length > 0 ? (
|
||||
<>
|
||||
Chats: <span className="text-zinc-400">{formChatId || "(none selected)"}</span>
|
||||
{" · "}
|
||||
<button
|
||||
onClick={() => setShowManualInput(!showManualInput)}
|
||||
className="text-blue-400 hover:underline"
|
||||
>
|
||||
{showManualInput ? "hide manual input" : "edit manually"}
|
||||
</button>
|
||||
</>
|
||||
) : (
|
||||
"Click Detect Chats after adding the bot to groups or sending /start in DMs."
|
||||
)}
|
||||
</p>
|
||||
</div>
|
||||
))
|
||||
)}
|
||||
|
||||
<div>
|
||||
<label htmlFor={allowedUsersId} className="text-[10px] text-zinc-500 block mb-1">
|
||||
Allowed Users <span className="text-zinc-600">(optional, comma-separated)</span>
|
||||
@ -323,7 +358,7 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
className="w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600"
|
||||
/>
|
||||
<p className="text-[11px] text-zinc-500 mt-0.5">
|
||||
Telegram user IDs. Leave empty to allow everyone.
|
||||
Platform-specific user IDs. Leave empty to allow everyone.
|
||||
</p>
|
||||
</div>
|
||||
{formError && (
|
||||
@ -343,7 +378,7 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
<div className="text-center py-8">
|
||||
<p className="text-zinc-500 text-xs">No channels connected</p>
|
||||
<p className="text-zinc-600 text-[10px] mt-1">
|
||||
Connect Telegram, Slack, or Discord to chat with this agent from social platforms.
|
||||
Connect Telegram, Slack, Discord, or Lark / Feishu to chat with this agent from social platforms.
|
||||
</p>
|
||||
</div>
|
||||
)}
|
||||
@ -364,7 +399,7 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
{ch.channel_type.charAt(0).toUpperCase() + ch.channel_type.slice(1)}
|
||||
</span>
|
||||
<span className="text-[10px] text-zinc-500">
|
||||
{ch.config.chat_id}
|
||||
{ch.config.chat_id || ch.config.channel_id || ""}
|
||||
</span>
|
||||
</div>
|
||||
<div className="flex items-center gap-1.5">
|
||||
@ -415,3 +450,53 @@ export function ChannelsTab({ workspaceId }: Props) {
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// SchemaField renders one ConfigField as a label + input. Kept inline in
|
||||
// this file so the ChannelsTab stays self-contained; promote to its own
|
||||
// module if another tab ever needs it.
|
||||
function SchemaField({
|
||||
field,
|
||||
value,
|
||||
onChange,
|
||||
renderExtras,
|
||||
}: {
|
||||
field: ConfigField;
|
||||
value: string;
|
||||
onChange: (v: string) => void;
|
||||
renderExtras?: () => React.ReactNode;
|
||||
}) {
|
||||
const inputId = useId();
|
||||
const common =
|
||||
"w-full text-xs bg-zinc-900 border border-zinc-700 rounded px-2 py-1.5 text-zinc-300 placeholder-zinc-600";
|
||||
return (
|
||||
<div>
|
||||
<label htmlFor={inputId} className="text-[10px] text-zinc-500 block mb-1">
|
||||
{field.label}
|
||||
{!field.required && <span className="text-zinc-600"> (optional)</span>}
|
||||
</label>
|
||||
{field.type === "textarea" ? (
|
||||
<textarea
|
||||
id={inputId}
|
||||
value={value}
|
||||
onChange={(e) => onChange(e.target.value)}
|
||||
placeholder={field.placeholder}
|
||||
rows={3}
|
||||
className={common}
|
||||
/>
|
||||
) : (
|
||||
<input
|
||||
id={inputId}
|
||||
type={field.type === "password" ? "password" : "text"}
|
||||
value={value}
|
||||
onChange={(e) => onChange(e.target.value)}
|
||||
placeholder={field.placeholder}
|
||||
className={common}
|
||||
/>
|
||||
)}
|
||||
{renderExtras?.()}
|
||||
{field.help && (
|
||||
<p className="text-[11px] text-zinc-500 mt-0.5">{field.help}</p>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
@ -44,7 +44,7 @@ export function FilesToolbar({
|
||||
<div className="flex gap-1.5">
|
||||
{root === "/configs" && (
|
||||
<>
|
||||
<button onClick={onNewFile} aria-label="Create new file" className="text-[10px] text-blue-400 hover:text-blue-300" title="Create new file">
|
||||
<button type="button" onClick={onNewFile} aria-label="Create new file" className="text-[10px] text-blue-400 hover:text-blue-300" title="Create new file">
|
||||
+ New
|
||||
</button>
|
||||
<input
|
||||
@ -57,20 +57,20 @@ export function FilesToolbar({
|
||||
className="hidden"
|
||||
onChange={(e) => e.target.files && onUpload(e.target.files)}
|
||||
/>
|
||||
<button onClick={() => uploadRef.current?.click()} aria-label="Upload folder" className="text-[10px] text-blue-400 hover:text-blue-300" title="Upload folder">
|
||||
<button type="button" onClick={() => uploadRef.current?.click()} aria-label="Upload folder" className="text-[10px] text-blue-400 hover:text-blue-300" title="Upload folder">
|
||||
Upload
|
||||
</button>
|
||||
</>
|
||||
)}
|
||||
<button onClick={onDownloadAll} aria-label="Download all files" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Download all files">
|
||||
<button type="button" onClick={onDownloadAll} aria-label="Download all files" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Download all files">
|
||||
Export
|
||||
</button>
|
||||
{root === "/configs" && (
|
||||
<button onClick={onClearAll} aria-label="Delete all files" className="text-[10px] text-red-400/60 hover:text-red-400" title="Delete all files">
|
||||
<button type="button" onClick={onClearAll} aria-label="Delete all files" className="text-[10px] text-red-400/60 hover:text-red-400" title="Delete all files">
|
||||
Clear
|
||||
</button>
|
||||
)}
|
||||
<button onClick={onRefresh} aria-label="Refresh file list" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Refresh">
|
||||
<button type="button" onClick={onRefresh} aria-label="Refresh file list" className="text-[10px] text-zinc-500 hover:text-zinc-300" title="Refresh">
|
||||
↻
|
||||
</button>
|
||||
</div>
|
||||
|
||||
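A note on the `type="button"` sweep in this toolbar and in the hunks that follow (TracesTab, TagList, Section, the secrets rows): an HTML `<button>` without an explicit `type` defaults to `type="submit"`, so if any of these controls ever renders inside a `<form>`, a click submits the form (and can reload the page) instead of just running its `onClick`. A minimal illustration — the form below is hypothetical, not a component from this diff:

```tsx
// Hypothetical example, not a component from this PR.
function Example({ onRefresh }: { onRefresh: () => void }) {
  return (
    <form onSubmit={(e) => e.preventDefault()}>
      {/* Implicit type="submit": clicking fires onRefresh AND submits the form. */}
      <button onClick={onRefresh}>Refresh (implicit submit)</button>
      {/* Explicit type="button": clicking only fires onRefresh. */}
      <button type="button" onClick={onRefresh}>Refresh</button>
    </form>
  );
}
```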
@ -55,7 +55,7 @@ export function TracesTab({ workspaceId }: Props) {
|
||||
<div className="p-4 space-y-2">
|
||||
<div className="flex items-center justify-between mb-2">
|
||||
<span className="text-xs text-zinc-400">{traces.length} traces</span>
|
||||
<button onClick={loadTraces} className="text-[10px] text-zinc-500 hover:text-zinc-300">
|
||||
<button type="button" onClick={loadTraces} className="text-[10px] text-zinc-500 hover:text-zinc-300">
|
||||
Refresh
|
||||
</button>
|
||||
</div>
|
||||
|
||||
@ -104,7 +104,7 @@ export function TagList({ label, values, onChange, placeholder }: { label: strin
|
||||
{values.map((v, i) => (
|
||||
<span key={i} className="inline-flex items-center gap-1 px-1.5 py-0.5 bg-zinc-800 border border-zinc-700 rounded text-[10px] text-zinc-300 font-mono">
|
||||
{v}
|
||||
<button aria-label={`Remove tag ${v}`} onClick={() => onChange(values.filter((_, j) => j !== i))} className="text-zinc-500 hover:text-red-400">×</button>
|
||||
<button type="button" aria-label={`Remove tag ${v}`} onClick={() => onChange(values.filter((_, j) => j !== i))} className="text-zinc-500 hover:text-red-400">×</button>
|
||||
</span>
|
||||
))}
|
||||
</div>
|
||||
@ -131,7 +131,7 @@ export function Section({ title, children, defaultOpen = true }: { title: string
|
||||
const [open, setOpen] = useState(defaultOpen);
|
||||
return (
|
||||
<div className="border border-zinc-800 rounded mb-2">
|
||||
<button onClick={() => setOpen(!open)} className="w-full flex items-center justify-between px-3 py-1.5 text-[10px] text-zinc-400 hover:text-zinc-200 bg-zinc-900/50">
|
||||
<button type="button" onClick={() => setOpen(!open)} className="w-full flex items-center justify-between px-3 py-1.5 text-[10px] text-zinc-400 hover:text-zinc-200 bg-zinc-900/50">
|
||||
<span className="font-medium uppercase tracking-wider">{title}</span>
|
||||
<span>{open ? "▾" : "▸"}</span>
|
||||
</button>
|
||||
|
||||
@ -113,9 +113,9 @@ function SecretRow({ label, secretKey, isSet, scope, globalMode, onSave, onDelet
|
||||
{isSet && <span className="text-[10px] text-green-500 bg-green-900/30 px-1.5 py-0.5 rounded">Set</span>}
|
||||
{scope && <ScopeBadge scope={scope} />}
|
||||
{!editing && isSet && (globalMode || scope !== "global") && (
|
||||
<button onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
|
||||
<button type="button" onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
|
||||
)}
|
||||
<button onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
|
||||
<button type="button" onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
|
||||
{actionLabel()}
|
||||
</button>
|
||||
</div>
|
||||
@ -128,7 +128,7 @@ function SecretRow({ label, secretKey, isSet, scope, globalMode, onSave, onDelet
|
||||
type={isPlaintext ? "text" : "password"} autoFocus
|
||||
className="flex-1 bg-zinc-900 border border-zinc-600 rounded px-2 py-1 text-[10px] text-zinc-100 font-mono focus:outline-none focus:border-blue-500"
|
||||
/>
|
||||
<button
|
||||
<button type="button"
|
||||
onClick={() => { onSave(value); setEditing(false); setValue(""); }}
|
||||
disabled={!value}
|
||||
className="px-2 py-1 bg-blue-600 hover:bg-blue-500 text-[10px] rounded text-white disabled:opacity-30"
|
||||
@ -165,10 +165,10 @@ function CustomSecretRow({ secretKey, scope, globalMode, onSave, onDelete }: {
|
||||
<span className="text-[10px] text-green-500">Set</span>
|
||||
{!globalMode && <ScopeBadge scope={scope} />}
|
||||
{canDelete && !editing && (
|
||||
<button onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
|
||||
<button type="button" onClick={onDelete} className="text-[11px] text-red-400 hover:text-red-300">Remove</button>
|
||||
)}
|
||||
{(canDelete || showOverride) && (
|
||||
<button onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
|
||||
<button type="button" onClick={() => setEditing(!editing)} className="text-[11px] text-blue-400 hover:text-blue-300">
|
||||
{editing ? "Cancel" : showOverride ? "Override" : "Update"}
|
||||
</button>
|
||||
)}
|
||||
@ -181,7 +181,7 @@ function CustomSecretRow({ secretKey, scope, globalMode, onSave, onDelete }: {
|
||||
placeholder="New value" type="password" autoFocus
|
||||
className="flex-1 bg-zinc-900 border border-zinc-600 rounded px-2 py-1 text-[10px] text-zinc-100 font-mono focus:outline-none focus:border-blue-500"
|
||||
/>
|
||||
<button
|
||||
<button type="button"
|
||||
onClick={() => { onSave(value); setEditing(false); setValue(""); }}
|
||||
disabled={!value}
|
||||
className="px-2 py-1 bg-blue-600 hover:bg-blue-500 text-[10px] rounded text-white disabled:opacity-30"
|
||||
@ -355,16 +355,16 @@ export function SecretsSection({ workspaceId, requiredEnv }: { workspaceId: stri
|
||||
<input value={newValue} onChange={(e) => setNewValue(e.target.value)} placeholder="Value" type="password"
|
||||
className="w-full bg-zinc-900 border border-zinc-600 rounded px-2 py-1 text-[10px] text-zinc-100 focus:outline-none focus:border-blue-500" />
|
||||
<div className="flex gap-2">
|
||||
<button onClick={() => { if (newKey && newValue) handleSave(newKey, newValue); }} disabled={!newKey || !newValue}
|
||||
<button type="button" onClick={() => { if (newKey && newValue) handleSave(newKey, newValue); }} disabled={!newKey || !newValue}
|
||||
className="px-2 py-1 bg-blue-600 hover:bg-blue-500 text-[10px] rounded text-white disabled:opacity-30">
|
||||
Save{globalMode ? " (Global)" : ""}
|
||||
</button>
|
||||
<button onClick={() => { setShowAdd(false); setNewKey(""); setNewValue(""); }}
|
||||
<button type="button" onClick={() => { setShowAdd(false); setNewKey(""); setNewValue(""); }}
|
||||
className="px-2 py-1 bg-zinc-700 hover:bg-zinc-600 text-[10px] rounded text-zinc-300">Cancel</button>
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
<button onClick={() => setShowAdd(true)} className="text-[10px] text-blue-400 hover:text-blue-300">
|
||||
<button type="button" onClick={() => setShowAdd(true)} className="text-[10px] text-blue-400 hover:text-blue-300">
|
||||
+ Add {globalMode ? "Global " : ""}Variable
|
||||
</button>
|
||||
)}
|
||||
|
||||
@ -6,32 +6,44 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
// runs happily in node. Splitting keeps the node tests fast.
|
||||
|
||||
// ---------------------------------------------------------------------------
// 401 handling — gated on SaaS-tenant hostname
// 401 handling — session-probe-before-redirect
// ---------------------------------------------------------------------------
//
// Before fix/quickstart-bugless, any 401 from any endpoint triggered
// `redirectToLogin()`, navigating to `/cp/auth/login`. That route
// exists only on SaaS (mounted by cp_proxy when CP_UPSTREAM_URL is
// set). On localhost / self-hosted / Vercel preview it 404s, so the
// user lands on a broken login page instead of seeing the actual error.
// History:
// 1. fix/quickstart-bugless: gated redirect on SaaS hostname (slug).
// 2. fix/api-401-probe-before-redirect (this file): probe /cp/auth/me
// before redirecting on a 401 from a non-auth path. The earlier
// behaviour redirected on EVERY 401, so a single 401 from
// /workspaces/:id/plugins (workspace-scoped — refused by the
// tenant admin bearer) yanked the user to AuthKit even when
// the session was fine. The probe lets us tell "session dead"
// from "endpoint refused this token."
//
// These tests lock in:
// - SaaS tenant hostname (*.moleculesai.app) → 401 still redirects.
// - non-SaaS hostname (localhost, LAN IP, apex) → 401 throws, no
// redirect, so the caller renders a real error affordance.
// Matrix:
// slug | path | probe → me | expected
// --- | --- | --- | ---
// acme | /cp/auth/me | (n/a) | redirect (path IS auth)
// acme | /workspaces/... | 401 | redirect (session dead)
// acme | /workspaces/... | 200 | throw, no redirect
// acme | /workspaces/... | network err | throw, no redirect
// "" | /workspaces/... | (n/a) | throw, no redirect (no slug)

const mockFetch = vi.fn();
|
||||
globalThis.fetch = mockFetch;
|
||||
|
||||
function mockFailure(status: number, text: string) {
|
||||
function mockNextResponse(status: number, text = "") {
|
||||
mockFetch.mockResolvedValueOnce({
|
||||
ok: false,
|
||||
ok: status >= 200 && status < 300,
|
||||
status,
|
||||
json: () => Promise.reject(new Error("no json")),
|
||||
text: () => Promise.resolve(text),
|
||||
} as unknown as Response);
|
||||
}
|
||||
|
||||
function mockNextNetworkError() {
|
||||
mockFetch.mockRejectedValueOnce(new Error("network"));
|
||||
}
|
||||
|
||||
function setHostname(host: string) {
|
||||
Object.defineProperty(window, "location", {
|
||||
configurable: true,
|
||||
@ -59,27 +71,66 @@ describe("api 401 handling", () => {
|
||||
vi.resetModules();
|
||||
});
|
||||
|
||||
it("redirects to login on SaaS tenant hostname", async () => {
|
||||
it("redirects when /cp/auth/me itself 401s — that IS the session-dead signal", async () => {
|
||||
setHostname("acme.moleculesai.app");
|
||||
mockFailure(401, '{"error":"admin auth required"}');
|
||||
// Single fetch: the /cp/auth/me call itself.
|
||||
mockNextResponse(401, '{"error":"unauthenticated"}');
|
||||
|
||||
const { api } = await import("../api");
|
||||
await expect(api.get("/workspaces")).rejects.toThrow(/Session expired/);
|
||||
await expect(api.get("/cp/auth/me")).rejects.toThrow(/Session expired/);
|
||||
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
|
||||
// No probe fired — we already know the session is dead.
|
||||
expect(mockFetch).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("redirects when /cp/auth/me probe ALSO 401s — session genuinely dead", async () => {
|
||||
setHostname("acme.moleculesai.app");
|
||||
// First call: the workspace-scoped fetch returns 401.
|
||||
mockNextResponse(401, '{"error":"workspace token required"}');
|
||||
// Second call: the probe to /cp/auth/me also 401s.
|
||||
mockNextResponse(401, '{"error":"unauthenticated"}');
|
||||
|
||||
const { api } = await import("../api");
|
||||
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/Session expired/);
|
||||
expect(redirectSpy).toHaveBeenCalledWith("sign-in");
|
||||
});
|
||||
|
||||
it("does NOT redirect when probe returns 200 — endpoint refused this token, session fine", async () => {
|
||||
setHostname("acme.moleculesai.app");
|
||||
// First call: workspace-scoped 401.
|
||||
mockNextResponse(401, '{"error":"workspace token required"}');
|
||||
// Second call: probe shows the session is alive.
|
||||
mockNextResponse(200, '{"user_id":"u1","org_id":"o1","email":"x@y"}');
|
||||
|
||||
const { api } = await import("../api");
|
||||
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
|
||||
expect(redirectSpy).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("does NOT redirect when probe network-errors — conservative fallback", async () => {
|
||||
setHostname("acme.moleculesai.app");
|
||||
mockNextResponse(401, '{"error":"workspace token required"}');
|
||||
mockNextNetworkError();
|
||||
|
||||
const { api } = await import("../api");
|
||||
await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/);
|
||||
expect(redirectSpy).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("does NOT redirect on localhost — throws a real error instead", async () => {
|
||||
setHostname("localhost");
|
||||
mockFailure(401, '{"error":"admin auth required"}');
|
||||
mockNextResponse(401, '{"error":"admin auth required"}');
|
||||
|
||||
const { api } = await import("../api");
|
||||
await expect(api.get("/workspaces")).rejects.toThrow(/401/);
|
||||
expect(redirectSpy).not.toHaveBeenCalled();
|
||||
// No slug → no probe fires either.
|
||||
expect(mockFetch).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("does NOT redirect on a LAN hostname", async () => {
|
||||
setHostname("192.168.1.74");
|
||||
mockFailure(401, '{"error":"missing workspace auth token"}');
|
||||
mockNextResponse(401, '{"error":"missing workspace auth token"}');
|
||||
|
||||
const { api } = await import("../api");
|
||||
await expect(api.get("/workspaces/abc/activity")).rejects.toThrow(/401/);
|
||||
@ -91,7 +142,7 @@ describe("api 401 handling", () => {
|
||||
// Users landing on app.moleculesai.app (pre-tenant-selection) must
|
||||
// see the real 401 error rather than loop on login.
|
||||
setHostname("app.moleculesai.app");
|
||||
mockFailure(401, '{"error":"admin auth required"}');
|
||||
mockNextResponse(401, '{"error":"admin auth required"}');
|
||||
|
||||
const { api } = await import("../api");
|
||||
await expect(api.get("/workspaces")).rejects.toThrow(/401/);
|
||||
|
||||
@ -60,15 +60,45 @@ async function request<T>(
|
||||
return request<T>(method, path, body, retryCount + 1, options);
|
||||
}
|
||||
if (res.status === 401) {
|
||||
// Session expired or credentials lost. On SaaS (tenant subdomain)
|
||||
// the login page lives at /cp/auth/login and is mounted by the
|
||||
// control-plane reverse proxy — redirect. On self-hosted / local
|
||||
// dev / Vercel preview there IS no /cp/* mount, so redirecting
|
||||
// would navigate to a 404 ("404 page not found") instead of the
|
||||
// real error the user should see. In that case, throw instead
|
||||
// and let the caller render a meaningful failure (retry button,
|
||||
// error banner, etc.).
|
||||
if (slug) {
|
||||
// Distinguish "session is dead" from "this endpoint refused this
|
||||
// token." Old behaviour blanket-redirected on every 401, so a
|
||||
// single transient 401 from a workspace-scoped endpoint
|
||||
// (/workspaces/:id/peers, /plugins, etc. that need a workspace
|
||||
// token rather than the tenant admin bearer) yanked the user
|
||||
// back to AuthKit even when their session was perfectly fine.
|
||||
// That broke the staging-tabs E2E for the entire 2026-04-25
|
||||
// night; #2073/#2074 worked around the symptom in the test by
|
||||
// mocking 401→200 for every fetch, but the user-facing bug
|
||||
// stayed.
|
||||
//
|
||||
// The canonical "session is dead" signal is /cp/auth/me
|
||||
// returning 401. For any 401 on a non-auth path, probe
|
||||
// /cp/auth/me before deciding to redirect:
|
||||
// - probe 401 → session is actually dead → redirect
|
||||
// - probe 200 → session is fine, the endpoint just refused
|
||||
// our specific token → throw a real error,
|
||||
// caller renders an error state
|
||||
// - probe network error → assume session-fine (conservative;
|
||||
// better to throw than to redirect on a
|
||||
// transient probe failure)
|
||||
//
|
||||
// Self-hosted / localhost / reserved subdomains still throw
|
||||
// without redirecting (slug is empty in those cases) — same
|
||||
// policy as before.
|
||||
const isAuthPath = path.startsWith("/cp/auth/");
|
||||
let sessionDead = isAuthPath;
|
||||
if (!isAuthPath && slug) {
|
||||
try {
|
||||
const probe = await fetch(`${PLATFORM_URL}/cp/auth/me`, {
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(5000),
|
||||
});
|
||||
sessionDead = probe.status === 401;
|
||||
} catch {
|
||||
// Probe failed (network/timeout) — fall through to throw.
|
||||
}
|
||||
}
|
||||
if (sessionDead && slug) {
|
||||
const { redirectToLogin } = await import("./auth");
|
||||
redirectToLogin("sign-in");
|
||||
throw new Error("Session expired — redirecting to login");
|
||||
|
||||
@ -32,6 +32,10 @@ export interface Plan {
|
||||
// plans is the canonical order shown on the pricing page: free → starter
|
||||
// → pro. Change the order here + the rendered columns follow. Keeping
|
||||
// this as a module-level const so tests can assert against a known list.
|
||||
//
|
||||
// Flat-rate positioning (Issue #1833): "starter" and "pro" are flat-rate
|
||||
// per-org, not per-seat. This is a deliberate wedge against Cursor/Windsurf
|
||||
// ($40/seat) — at 5 engineers the Team tier is 28% cheaper.
|
||||
export const plans: Plan[] = [
|
||||
{
|
||||
id: "free",
|
||||
@ -48,8 +52,8 @@ export const plans: Plan[] = [
|
||||
},
|
||||
{
|
||||
id: "starter",
|
||||
name: "Starter",
|
||||
tagline: "For small teams shipping real agents",
|
||||
name: "Team",
|
||||
tagline: "Flat-rate for teams — one price, no per-seat fees",
|
||||
price: "$29/month",
|
||||
features: [
|
||||
"10 workspaces",
|
||||
@ -57,14 +61,15 @@ export const plans: Plan[] = [
|
||||
"Private Upstash Redis namespace",
|
||||
"Email support (48h)",
|
||||
"5M LLM tokens / month included",
|
||||
"No per-seat pricing",
|
||||
],
|
||||
ctaLabel: "Upgrade to Starter",
|
||||
ctaLabel: "Upgrade to Team",
|
||||
highlighted: true,
|
||||
},
|
||||
{
|
||||
id: "pro",
|
||||
name: "Pro",
|
||||
tagline: "For production multi-agent orgs",
|
||||
name: "Growth",
|
||||
tagline: "Flat-rate for production multi-agent orgs",
|
||||
price: "$99/month",
|
||||
features: [
|
||||
"Unlimited workspaces",
|
||||
@ -72,9 +77,10 @@ export const plans: Plan[] = [
|
||||
"Cross-workspace A2A audit log",
|
||||
"Priority support (24h)",
|
||||
"25M LLM tokens / month included",
|
||||
"No per-seat pricing",
|
||||
"Usage-based overage billing",
|
||||
],
|
||||
ctaLabel: "Upgrade to Pro",
|
||||
ctaLabel: "Upgrade to Growth",
|
||||
},
|
||||
];
|
||||
|
||||
|
||||
120
canvas/src/lib/runtimeProfiles.ts
Normal file
@ -0,0 +1,120 @@
|
||||
/**
|
||||
* Runtime profiles — per-runtime UX metadata.
|
||||
*
|
||||
* Scaling target: hundreds of runtimes (plugin-architecture-v2 roadmap).
|
||||
* This module is the single source of truth for runtime-specific UI knobs
|
||||
* on the canvas side. Each runtime can declare:
|
||||
*
|
||||
* - provisionTimeoutMs: when to show the "taking longer than expected"
|
||||
* banner. Fast docker runtimes = 2min; slow source-build runtimes = 12min.
|
||||
* - (future) label, icon, color, helpUrl, capabilities — add as needed.
|
||||
*
|
||||
* Resolution order (most specific wins):
|
||||
*
|
||||
* 1. Server-provided override on the workspace data (e.g.
|
||||
* `workspace.data.provisionTimeoutMs` set from a template manifest).
|
||||
* Lets operators tune without a canvas release once server-side
|
||||
* declarative config lands.
|
||||
* 2. Per-runtime entry in RUNTIME_PROFILES.
|
||||
* 3. DEFAULT_RUNTIME_PROFILE.
|
||||
*
|
||||
* Adding a new runtime:
|
||||
* - If it's fast (≤ 2min cold boot): do nothing, the default catches it.
|
||||
* - If it's slow: add one entry to RUNTIME_PROFILES below.
|
||||
* - Long-term: move runtime profiles server-side so this file can shrink.
|
||||
*
|
||||
* Architectural note: this deliberately lives under /lib, NOT
|
||||
* /components/ProvisioningTimeout. Other components (e.g. a
|
||||
* "create workspace" dialog that needs to know the runtime's expected
|
||||
* cold-boot time) should import from here too — avoids duplicating the
|
||||
* runtime-name knowledge across the codebase.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Structural shape of a runtime profile. Add fields as new UX knobs
|
||||
* become runtime-specific. Every field should be optional so new runtimes
|
||||
* can partially fill the profile without breaking older code that reads
|
||||
* only some fields.
|
||||
*/
|
||||
export interface RuntimeProfile {
|
||||
/** Milliseconds before the canvas shows the "taking too long" banner.
|
||||
* Base value — the ProvisioningTimeout component still scales this by
|
||||
* concurrent-provisioning count. */
|
||||
provisionTimeoutMs?: number;
|
||||
// Future extensions (kept commented until used):
|
||||
// label?: string;
|
||||
// icon?: string;
|
||||
// color?: string;
|
||||
// helpUrl?: string;
|
||||
}
|
||||
|
||||
/** The floor every runtime inherits unless it overrides. Calibrated for
|
||||
* docker-local fast runtimes (claude-code, langgraph, crewai) where cold
|
||||
* boot is 30-90s. */
|
||||
export const DEFAULT_RUNTIME_PROFILE: Required<
|
||||
Pick<RuntimeProfile, "provisionTimeoutMs">
|
||||
> = {
|
||||
provisionTimeoutMs: 120_000, // 2 min
|
||||
};
|
||||
|
||||
/**
|
||||
* Named per-runtime overrides. Keep this map small and explicit —
|
||||
* each entry is a deliberate statement that this runtime's cold-boot
|
||||
* behavior differs materially from the default.
|
||||
*
|
||||
* Each override must also ship with a comment explaining WHY the default
|
||||
* is wrong for this runtime. Unexplained numbers rot.
|
||||
*/
|
||||
export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
|
||||
hermes: {
|
||||
// 12 min. Installs ripgrep + ffmpeg + node22 + builds hermes-agent
|
||||
// from source + Playwright + Chromium (~300MB download). Measured
|
||||
// cold boots on staging EC2 routinely land at 8-13 min. Aligns
|
||||
// with SaaS E2E's PROVISION_TIMEOUT_SECS=900 (15 min) so the UI
|
||||
// warning lands shortly before the backend itself gives up.
|
||||
provisionTimeoutMs: 720_000,
|
||||
},
|
||||
};
|
||||
|
||||
/**
|
||||
* Data fields the canvas can consult for per-workspace overrides. These
|
||||
* let the backend (via workspace data on the socket payload) override
|
||||
* profile values without a canvas release.
|
||||
*
|
||||
* Intentionally loose typing — if a field isn't present on the node, we
|
||||
* fall through to the runtime profile.
|
||||
*/
|
||||
export interface WorkspaceRuntimeOverrides {
|
||||
provisionTimeoutMs?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve a runtime profile for a given runtime name, optionally merging
|
||||
* server-provided per-workspace overrides on top.
|
||||
*
|
||||
* Resolution (most-specific wins):
|
||||
* overrides.provisionTimeoutMs
|
||||
* → RUNTIME_PROFILES[runtime].provisionTimeoutMs
|
||||
* → DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs
|
||||
*/
|
||||
export function getRuntimeProfile(
|
||||
runtime: string | undefined,
|
||||
overrides?: WorkspaceRuntimeOverrides,
|
||||
): Required<Pick<RuntimeProfile, "provisionTimeoutMs">> {
|
||||
const profile = runtime ? RUNTIME_PROFILES[runtime] : undefined;
|
||||
return {
|
||||
provisionTimeoutMs:
|
||||
overrides?.provisionTimeoutMs ??
|
||||
profile?.provisionTimeoutMs ??
|
||||
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
|
||||
};
|
||||
}
|
||||
|
||||
/** Convenience: just the provisionTimeoutMs. Equivalent to
|
||||
* `getRuntimeProfile(runtime, overrides).provisionTimeoutMs`. */
|
||||
export function provisionTimeoutForRuntime(
|
||||
runtime: string | undefined,
|
||||
overrides?: WorkspaceRuntimeOverrides,
|
||||
): number {
|
||||
return getRuntimeProfile(runtime, overrides).provisionTimeoutMs;
|
||||
}
|
||||
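For consumers, resolution is a single call. A minimal usage sketch — the import path and the linear concurrency multiplier are assumptions for illustration; the shipped ProvisioningTimeout component owns the real scaling logic:

```ts
import { provisionTimeoutForRuntime } from "@/lib/runtimeProfiles"; // path assumed

// Hypothetical helper: scale the base timeout by how many workspaces are
// provisioning at once (the multiplier here is illustrative only).
function bannerDelayMs(runtime: string | undefined, concurrentProvisions: number): number {
  const base = provisionTimeoutForRuntime(runtime); // hermes → 720_000, default → 120_000
  return base * Math.max(1, concurrentProvisions);
}

bannerDelayMs("hermes", 1);      // 720_000 (12 min)
bannerDelayMs("claude-code", 1); // 120_000 (2 min) via DEFAULT_RUNTIME_PROFILE
```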
@ -4,6 +4,7 @@
|
||||
"plugins": [
|
||||
{"name": "browser-automation", "repo": "Molecule-AI/molecule-ai-plugin-browser-automation", "ref": "main"},
|
||||
{"name": "ecc", "repo": "Molecule-AI/molecule-ai-plugin-ecc", "ref": "main"},
|
||||
{"name": "gh-identity", "repo": "Molecule-AI/molecule-ai-plugin-gh-identity", "ref": "main"},
|
||||
{"name": "molecule-audit", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit", "ref": "main"},
|
||||
{"name": "molecule-audit-trail", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit-trail", "ref": "main"},
|
||||
{"name": "molecule-careful-bash", "repo": "Molecule-AI/molecule-ai-plugin-molecule-careful-bash", "ref": "main"},
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
set -euo pipefail
|
||||
|
||||
DRY_RUN=1
|
||||
MAX_DELETE_PCT=50 # refuse to delete more than half the records in one run
|
||||
MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}" # refuse to delete more than this pct of records in one run; caller can override via env
|
||||
REGION="${AWS_DEFAULT_REGION:-us-east-2}"
|
||||
|
||||
for arg in "$@"; do
|
||||
|
||||
@ -23,10 +23,13 @@ import (
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"
|
||||
|
||||
// External plugin — registers an EnvMutator that injects GITHUB_TOKEN /
|
||||
// GH_TOKEN from a GitHub App installation token. Soft-dep: only active
|
||||
// when GITHUB_APP_ID env var is set (see main() for the gate).
|
||||
pluginloader "github.com/Molecule-AI/molecule-ai-plugin-github-app-auth/pluginloader"
|
||||
// External plugins — each registers EnvMutator(s) that run at workspace
|
||||
// provision time. Loaded via soft-dep gates in main() so self-hosters
|
||||
// without the App or without per-agent identity configured keep working.
|
||||
githubappauth "github.com/Molecule-AI/molecule-ai-plugin-github-app-auth/pluginloader"
|
||||
ghidentity "github.com/Molecule-AI/molecule-ai-plugin-gh-identity/pluginloader"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/pkg/provisionhook"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@ -153,22 +156,49 @@ func main() {
|
||||
wh.SetCPProvisioner(cpProv)
|
||||
}
|
||||
|
||||
// External-plugin env mutators — each plugin contributes 0+ mutators
|
||||
// onto a shared registry. Order matters: gh-identity populates
|
||||
// MOLECULE_AGENT_ROLE-derived attribution env vars that downstream
|
||||
// mutators and the workspace's install.sh can then read. Keep
|
||||
// github-app-auth last because it fails loudly on misconfig and its
|
||||
// failure mode is "no GITHUB_TOKEN" — worth surfacing after the
|
||||
// cheaper mutators already ran.
|
||||
envReg := provisionhook.NewRegistry()
|
||||
|
||||
// gh-identity plugin — per-agent attribution via env injection + gh
|
||||
// wrapper shipped as base64 env. Soft-dep: no config file is OK
|
||||
// (plugin no-ops when no role is set on the workspace).
|
||||
// Tracks molecule-core#1957.
|
||||
if res, err := ghidentity.BuildRegistry(); err != nil {
|
||||
log.Fatalf("gh-identity plugin: %v", err)
|
||||
} else {
|
||||
envReg.Register(res.Mutator)
|
||||
log.Printf("gh-identity: registered (config file=%q)", os.Getenv("MOLECULE_GH_IDENTITY_CONFIG_FILE"))
|
||||
}
|
||||
|
||||
// github-app-auth plugin — injects GITHUB_TOKEN + GH_TOKEN into every
|
||||
// workspace env using the App's installation access token (rotates ~hourly).
|
||||
// Soft-skip when GITHUB_APP_* env vars are absent so dev/self-hosters
|
||||
// without an App configured keep working; fail-loud only on MISCONFIG
|
||||
// (e.g. APP_ID set but key file missing), not on unset.
|
||||
if os.Getenv("GITHUB_APP_ID") != "" {
|
||||
if reg, err := pluginloader.BuildRegistry(); err != nil {
|
||||
if reg, err := githubappauth.BuildRegistry(); err != nil {
|
||||
log.Fatalf("github-app-auth plugin: %v", err)
|
||||
} else {
|
||||
wh.SetEnvMutators(reg)
|
||||
log.Printf("github-app-auth: registered, %d mutator(s) in chain", reg.Len())
|
||||
// Copy the plugin's mutators onto the shared registry so the
|
||||
// TokenProvider probe (FirstTokenProvider) still finds them.
|
||||
for _, m := range reg.Mutators() {
|
||||
envReg.Register(m)
|
||||
}
|
||||
log.Printf("github-app-auth: registered, %d mutator(s) added to chain", reg.Len())
|
||||
}
|
||||
} else {
|
||||
log.Println("github-app-auth: GITHUB_APP_ID unset — skipping plugin registration (agents will use any PAT from .env)")
|
||||
}
|
||||
|
||||
wh.SetEnvMutators(envReg)
|
||||
log.Printf("env-mutator chain: %v", envReg.Names())
|
||||
|
||||
// Offline handler: broadcast event + auto-restart the dead workspace
|
||||
onWorkspaceOffline := func(innerCtx context.Context, workspaceID string) {
|
||||
if err := broadcaster.RecordAndBroadcast(innerCtx, "WORKSPACE_OFFLINE", workspaceID, map[string]interface{}{}); err != nil {
|
||||
|
||||
@ -4,6 +4,7 @@ go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/DATA-DOG/go-sqlmock v1.5.2
|
||||
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f
|
||||
github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d
|
||||
github.com/alicebob/miniredis/v2 v2.37.0
|
||||
github.com/creack/pty v1.1.18
|
||||
|
||||
@ -4,8 +4,12 @@ github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7Oputl
|
||||
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
|
||||
github.com/Microsoft/go-winio v0.4.21 h1:+6mVbXh4wPzUrl1COX9A+ZCvEpYsOBZ6/+kwDnvLyro=
|
||||
github.com/Microsoft/go-winio v0.4.21/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84=
|
||||
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f h1:YkLRhUg+9qr9OV9N8dG1Hj0Ml7TThHlRwh5F//oUJVs=
|
||||
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f/go.mod h1:NqdtlWZDJvpXNJRHnMkPhTKHdA1LZTNH+63TB66JSOU=
|
||||
github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d h1:GpYhP6FxaJZc1Ljy5/YJ9ZIVGvfOqZBmDolNr2S5x2g=
|
||||
github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d/go.mod h1:3a6LR/zd7FjR9ZwLTbytwYlWuCBsbCOVFlEg0WnoYiM=
|
||||
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f h1:YkLRhUg+9qr9OV9N8dG1Hj0Ml7TThHlRwh5F//oUJVs=
|
||||
github.com/Molecule-AI/molecule-ai-plugin-gh-identity v0.0.0-20260424033845-4fd5ac7be30f/go.mod h1:NqdtlWZDJvpXNJRHnMkPhTKHdA1LZTNH+63TB66JSOU=
|
||||
github.com/alicebob/miniredis/v2 v2.37.0 h1:RheObYW32G1aiJIj81XVt78ZHJpHonHLHW7OLIshq68=
|
||||
github.com/alicebob/miniredis/v2 v2.37.0/go.mod h1:TcL7YfarKPGDAthEtl5NBeHZfeUQj6OXMm/+iu5cLMM=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
|
||||
|
||||
@ -17,6 +17,14 @@ type ChannelAdapter interface {
|
||||
// DisplayName returns the human-readable name (e.g. "Telegram").
|
||||
DisplayName() string
|
||||
|
||||
// ConfigSchema describes the config fields each adapter needs. The UI
|
||||
// renders the connect-channel form from this list, so each platform's
|
||||
// field set (Telegram bot_token+chat_id, Lark webhook_url+verify_token,
|
||||
// Slack bot_token+channel_id, Discord webhook_url) can be captured
|
||||
// correctly without per-platform UI branching. Adapters must return the
|
||||
// same schema on every call — the order is the rendering order.
|
||||
ConfigSchema() []ConfigField
|
||||
|
||||
// ValidateConfig checks that channel_config JSONB has required fields.
|
||||
ValidateConfig(config map[string]interface{}) error
|
||||
|
||||
@ -31,6 +39,33 @@ type ChannelAdapter interface {
|
||||
StartPolling(ctx context.Context, config map[string]interface{}, onMessage MessageHandler) error
|
||||
}
|
||||
|
||||
// ConfigField describes a single config field for the channels connect-form UI.
// Canvas renders one input per field in order. Values are strings in
// channel_config JSONB — this struct carries only presentation + validation
// hints; ValidateConfig on the adapter is still the source of truth for
// acceptance.
type ConfigField struct {
// Key is the channel_config map key (e.g. "webhook_url").
Key string `json:"key"`
// Label is the human-readable field name (e.g. "Webhook URL").
Label string `json:"label"`
// Type controls the HTML input type: "text" | "password" | "textarea".
Type string `json:"type"`
// Required marks the field as non-optional in the UI. Still enforced
// server-side via ValidateConfig regardless of this flag.
Required bool `json:"required"`
// Sensitive means the value must not be logged or shown unmasked in
// read APIs after creation. Canvas uses this to redact the value in
// list responses; server-side encryption is governed by sensitiveFields
// in secret.go (today: bot_token + webhook_secret only — this flag is
// forward-looking until that list is widened).
Sensitive bool `json:"sensitive"`
// Placeholder is rendered as the input's placeholder attribute.
Placeholder string `json:"placeholder,omitempty"`
// Help is a short one-liner shown below the input.
Help string `json:"help,omitempty"`
}
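On the wire this struct serializes through the AdapterInfo shape in registry.go (later in this diff) and is consumed by the ChannelsTab above. A sketch of the canvas-side view — the TypeScript mirror is assumed to match the json tags on the struct, and the Telegram payload below is abridged for illustration:

```ts
// Assumed canvas-side mirror of the Go struct's json tags.
interface ConfigField {
  key: string;
  label: string;
  type: "text" | "password" | "textarea";
  required: boolean;
  sensitive: boolean;
  placeholder?: string;
  help?: string;
}

// Example of what GET /channels/adapters could return for Telegram once the
// adapter's ConfigSchema() is serialized (values abridged).
const telegramAdapter: { type: string; display_name: string; config_schema: ConfigField[] } = {
  type: "telegram",
  display_name: "Telegram",
  config_schema: [
    { key: "bot_token", label: "Bot Token", type: "password", required: true, sensitive: true },
    { key: "chat_id", label: "Chat IDs", type: "text", required: true, sensitive: false },
  ],
};
```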
|
||||
// InboundMessage is the standardized message from any social platform.
|
||||
type InboundMessage struct {
|
||||
ChatID string // Platform-specific chat/channel ID
|
||||
|
||||
@ -127,10 +127,13 @@ func TestListAdapters(t *testing.T) {
|
||||
}
|
||||
found := false
|
||||
for _, a := range list {
|
||||
if a["type"] == "telegram" {
|
||||
if a.Type == "telegram" {
|
||||
found = true
|
||||
if a["display_name"] != "Telegram" {
|
||||
t.Errorf("expected display_name 'Telegram', got %q", a["display_name"])
|
||||
if a.DisplayName != "Telegram" {
|
||||
t.Errorf("expected display_name 'Telegram', got %q", a.DisplayName)
|
||||
}
|
||||
if len(a.ConfigSchema) == 0 {
|
||||
t.Error("Telegram adapter must expose a non-empty ConfigSchema")
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -740,10 +743,10 @@ func TestListAdapters_IncludesSlack(t *testing.T) {
|
||||
list := ListAdapters()
|
||||
found := false
|
||||
for _, a := range list {
|
||||
if a["type"] == "slack" {
|
||||
if a.Type == "slack" {
|
||||
found = true
|
||||
if a["display_name"] != "Slack" {
|
||||
t.Errorf("expected display_name 'Slack', got %q", a["display_name"])
|
||||
if a.DisplayName != "Slack" {
|
||||
t.Errorf("expected display_name 'Slack', got %q", a.DisplayName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -38,6 +38,32 @@ type DiscordAdapter struct{}
|
||||
func (d *DiscordAdapter) Type() string { return "discord" }
|
||||
func (d *DiscordAdapter) DisplayName() string { return "Discord" }
|
||||
|
||||
// ConfigSchema — Discord only needs a webhook URL for outbound.
|
||||
// public_key is the Ed25519 pubkey used to verify inbound Interactions
|
||||
// signatures (stored hex-encoded); not required if you only do outbound.
|
||||
func (d *DiscordAdapter) ConfigSchema() []ConfigField {
|
||||
return []ConfigField{
|
||||
{
|
||||
Key: "webhook_url",
|
||||
Label: "Webhook URL",
|
||||
Type: "password",
|
||||
Required: true,
|
||||
Sensitive: true,
|
||||
Placeholder: "https://discord.com/api/webhooks/{id}/{token}",
|
||||
Help: "From Server Settings → Integrations → Webhooks → Copy URL.",
|
||||
},
|
||||
{
|
||||
Key: "public_key",
|
||||
Label: "Interactions Public Key (hex)",
|
||||
Type: "password",
|
||||
Required: false,
|
||||
Sensitive: true,
|
||||
Placeholder: "optional — for inbound slash commands",
|
||||
Help: "Ed25519 public key from the Discord Developer Portal → General Information. Only needed to receive slash commands.",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ValidateConfig checks that the channel config contains a valid Discord
|
||||
// Incoming Webhook URL. Returns a human-readable error for the Canvas UI.
|
||||
func (d *DiscordAdapter) ValidateConfig(config map[string]interface{}) error {
|
||||
|
||||
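The inbound use of `public_key` isn't part of this hunk: Discord's Interactions scheme signs `timestamp + rawBody` with Ed25519 and sends the signature and timestamp as request headers. A hedged TypeScript sketch of the equivalent check, using tweetnacl — the platform's actual verification lives in the Go adapter's webhook path and may differ in detail:

```ts
import nacl from "tweetnacl";

// Sketch only. Header names and the timestamp+body message format follow
// Discord's documented Interactions handshake; production code is Go-side.
function verifyDiscordSignature(
  publicKeyHex: string,  // the "public_key" config field above
  signatureHex: string,  // X-Signature-Ed25519 header
  timestamp: string,     // X-Signature-Timestamp header
  rawBody: string,
): boolean {
  const hexToBytes = (hex: string) =>
    new Uint8Array(hex.match(/.{1,2}/g)!.map((b) => parseInt(b, 16)));
  return nacl.sign.detached.verify(
    new TextEncoder().encode(timestamp + rawBody),
    hexToBytes(signatureHex),
    hexToBytes(publicKeyHex),
  );
}
```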
@ -241,10 +241,10 @@ func TestListAdapters_IncludesDiscord(t *testing.T) {
|
||||
list := ListAdapters()
|
||||
found := false
|
||||
for _, a := range list {
|
||||
if a["type"] == "discord" {
|
||||
if a.Type == "discord" {
|
||||
found = true
|
||||
if a["display_name"] != "Discord" {
|
||||
t.Errorf("expected display_name 'Discord', got %q", a["display_name"])
|
||||
if a.DisplayName != "Discord" {
|
||||
t.Errorf("expected display_name 'Discord', got %q", a.DisplayName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -37,6 +37,33 @@ const (
|
||||
func (l *LarkAdapter) Type() string { return "lark" }
|
||||
func (l *LarkAdapter) DisplayName() string { return "Lark / Feishu" }
|
||||
|
||||
// ConfigSchema — Lark Custom Bot webhook URL + optional Event Subscription
|
||||
// verify token. The webhook URL already encodes the chat, so no separate
|
||||
// chat_id field is needed (and StartPolling is a no-op for Lark — inbound
|
||||
// is delivered by ParseWebhook from the Event Subscription callback).
|
||||
func (l *LarkAdapter) ConfigSchema() []ConfigField {
|
||||
return []ConfigField{
|
||||
{
|
||||
Key: "webhook_url",
|
||||
Label: "Custom Bot Webhook URL",
|
||||
Type: "password", // last path component is a secret
|
||||
Required: true,
|
||||
Sensitive: true,
|
||||
Placeholder: "https://open.feishu.cn/open-apis/bot/v2/hook/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX",
|
||||
Help: "From the Lark/Feishu bot page → Webhook settings. open.feishu.cn (China) and open.larksuite.com (international) both accepted.",
|
||||
},
|
||||
{
|
||||
Key: "verify_token",
|
||||
Label: "Event Subscription Verify Token",
|
||||
Type: "password",
|
||||
Required: false,
|
||||
Sensitive: true,
|
||||
Placeholder: "optional — from Event Subscriptions page",
|
||||
Help: "Only needed if you want to receive messages from Lark. Paste the \"Verification Token\" from your app's Event Subscriptions configuration.",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ValidateConfig requires webhook_url to point at a Lark or Feishu Custom
|
||||
// Bot endpoint. verify_token is optional — when set, inbound events with a
|
||||
// mismatching token are rejected (use Lark's "Verification Token" from the
|
||||
|
||||
@ -401,3 +401,60 @@ func TestRegistry_HasLark(t *testing.T) {
|
||||
t.Errorf("got %q want lark", a.Type())
|
||||
}
|
||||
}
|
||||
|
||||
// TestLark_ConfigSchema locks in the contract: Lark exposes a required +
|
||||
// sensitive webhook_url and an optional + sensitive verify_token, in that
|
||||
// order. Canvas renders the connect-form from this list so the order and
|
||||
// required/sensitive flags are observable surface.
|
||||
func TestLark_ConfigSchema(t *testing.T) {
|
||||
schema := (&LarkAdapter{}).ConfigSchema()
|
||||
if len(schema) != 2 {
|
||||
t.Fatalf("expected 2 fields, got %d", len(schema))
|
||||
}
|
||||
want := []struct {
|
||||
key string
|
||||
required bool
|
||||
sensitive bool
|
||||
}{
|
||||
{"webhook_url", true, true},
|
||||
{"verify_token", false, true},
|
||||
}
|
||||
for i, w := range want {
|
||||
got := schema[i]
|
||||
if got.Key != w.key {
|
||||
t.Errorf("field %d: key = %q, want %q", i, got.Key, w.key)
|
||||
}
|
||||
if got.Required != w.required {
|
||||
t.Errorf("field %d (%s): required = %v, want %v", i, w.key, got.Required, w.required)
|
||||
}
|
||||
if got.Sensitive != w.sensitive {
|
||||
t.Errorf("field %d (%s): sensitive = %v, want %v", i, w.key, got.Sensitive, w.sensitive)
|
||||
}
|
||||
if got.Label == "" {
|
||||
t.Errorf("field %d (%s): label must not be empty", i, w.key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestListAdapters_IncludesLark confirms the adapter is wired into the
|
||||
// registry and its schema reaches the API layer intact. Regression guard
|
||||
// against future registry.go refactors silently dropping Lark.
|
||||
func TestListAdapters_IncludesLark(t *testing.T) {
|
||||
list := ListAdapters()
|
||||
var found *AdapterInfo
|
||||
for i := range list {
|
||||
if list[i].Type == "lark" {
|
||||
found = &list[i]
|
||||
break
|
||||
}
|
||||
}
|
||||
if found == nil {
|
||||
t.Fatal("lark adapter not in ListAdapters() output")
|
||||
}
|
||||
if found.DisplayName != "Lark / Feishu" {
|
||||
t.Errorf("DisplayName = %q, want 'Lark / Feishu'", found.DisplayName)
|
||||
}
|
||||
if len(found.ConfigSchema) == 0 {
|
||||
t.Error("ConfigSchema must not be empty in registry output")
|
||||
}
|
||||
}
|
||||
|
||||
@ -15,14 +15,31 @@ func GetAdapter(channelType string) (ChannelAdapter, bool) {
|
||||
return a, ok
|
||||
}
|
||||
|
||||
// ListAdapters returns metadata about all available adapters.
|
||||
func ListAdapters() []map[string]string {
|
||||
result := make([]map[string]string, 0, len(adapters))
|
||||
// AdapterInfo is the metadata payload returned by ListAdapters — the Canvas
|
||||
// connect-channel form renders its field list dynamically from config_schema.
|
||||
type AdapterInfo struct {
|
||||
Type string `json:"type"`
|
||||
DisplayName string `json:"display_name"`
|
||||
ConfigSchema []ConfigField `json:"config_schema"`
|
||||
}
|
||||
|
||||
// ListAdapters returns metadata about all available adapters, in a stable
|
||||
// order (sorted by display name) so UI rendering + test assertions don't
|
||||
// depend on Go's random map iteration.
|
||||
func ListAdapters() []AdapterInfo {
|
||||
result := make([]AdapterInfo, 0, len(adapters))
|
||||
for _, a := range adapters {
|
||||
result = append(result, map[string]string{
|
||||
"type": a.Type(),
|
||||
"display_name": a.DisplayName(),
|
||||
result = append(result, AdapterInfo{
|
||||
Type: a.Type(),
|
||||
DisplayName: a.DisplayName(),
|
||||
ConfigSchema: a.ConfigSchema(),
|
||||
})
|
||||
}
|
||||
// Sort by display name for deterministic ordering.
|
||||
for i := 1; i < len(result); i++ {
|
||||
for j := i; j > 0 && result[j-1].DisplayName > result[j].DisplayName; j-- {
|
||||
result[j-1], result[j] = result[j], result[j-1]
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
@ -31,6 +31,57 @@ type SlackAdapter struct{}
|
||||
func (s *SlackAdapter) Type() string { return "slack" }
|
||||
func (s *SlackAdapter) DisplayName() string { return "Slack" }
|
||||
|
||||
// ConfigSchema — Slack supports two mutually-exclusive outbound modes:
|
||||
// Bot API (bot_token + channel_id, supports per-message identity override)
|
||||
// and Incoming Webhook (webhook_url, legacy, no identity override). The
|
||||
// form exposes both; ValidateConfig enforces "one or the other".
|
||||
func (s *SlackAdapter) ConfigSchema() []ConfigField {
|
||||
return []ConfigField{
|
||||
{
|
||||
Key: "bot_token",
|
||||
Label: "Bot Token (xoxb-…)",
|
||||
Type: "password",
|
||||
Required: false,
|
||||
Sensitive: true,
|
||||
Placeholder: "xoxb-1234-5678-abc...",
|
||||
Help: "Bot API mode — supports per-agent identity override. Required scopes: chat:write, chat:write.customize. Leave empty to use Incoming Webhook mode instead.",
|
||||
},
|
||||
{
|
||||
Key: "channel_id",
|
||||
Label: "Channel ID",
|
||||
Type: "text",
|
||||
Required: false,
|
||||
Placeholder: "C01234ABCDE",
|
||||
Help: "Required when using Bot Token mode. From the channel's \"View channel details\" dialog.",
|
||||
},
|
||||
{
|
||||
Key: "webhook_url",
|
||||
Label: "Incoming Webhook URL (legacy)",
|
||||
Type: "password",
|
||||
Required: false,
|
||||
Sensitive: true,
|
||||
Placeholder: "https://hooks.slack.com/services/T.../B.../...",
|
||||
Help: "Simpler mode — no per-agent identity. Either Bot Token OR Webhook URL is required.",
|
||||
},
|
||||
{
|
||||
Key: "username",
|
||||
Label: "Override Username",
|
||||
Type: "text",
|
||||
Required: false,
|
||||
Placeholder: "optional, Bot Token mode only",
|
||||
Help: "Display name to use on outbound messages. Ignored in Webhook mode.",
|
||||
},
|
||||
{
|
||||
Key: "icon_emoji",
|
||||
Label: "Override Icon Emoji",
|
||||
Type: "text",
|
||||
Required: false,
|
||||
Placeholder: ":robot_face:",
|
||||
Help: "Emoji shortcode for per-message avatar. Ignored in Webhook mode.",
|
||||
},
|
||||
}
|
||||
}
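// ConfigField itself is not part of this hunk; judging from the literals above
// it presumably looks roughly like the sketch below (field names taken from
// the usages, json tags are an assumption):
//
//	type ConfigField struct {
//		Key         string `json:"key"`
//		Label       string `json:"label"`
//		Type        string `json:"type"` // "text", "password", ...
//		Required    bool   `json:"required"`
//		Sensitive   bool   `json:"sensitive,omitempty"`
//		Placeholder string `json:"placeholder,omitempty"`
//		Help        string `json:"help,omitempty"`
//	}
//
// The "one or the other" rule mentioned above could be enforced along these
// lines (hypothetical helper name; the repo's actual ValidateConfig may differ):
//
//	func validateSlackModes(config map[string]interface{}) error {
//		bot, _ := config["bot_token"].(string)
//		hook, _ := config["webhook_url"].(string)
//		switch {
//		case bot == "" && hook == "":
//			return errors.New("either bot_token or webhook_url is required")
//		case bot != "" && hook != "":
//			return errors.New("bot_token and webhook_url are mutually exclusive")
//		}
//		if bot != "" {
//			if ch, _ := config["channel_id"].(string); ch == "" {
//				return errors.New("channel_id is required in Bot Token mode")
//			}
//		}
//		return nil
//	}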
|
||||
|
||||
// ValidateConfig checks that the channel config contains a valid Slack
|
||||
// Incoming Webhook URL (must start with https://hooks.slack.com/).
|
||||
// Returns an error whose message becomes part of the 400 response body so
|
||||
|
||||
@ -39,6 +39,31 @@ type TelegramAdapter struct{}
|
||||
func (t *TelegramAdapter) Type() string { return "telegram" }
|
||||
func (t *TelegramAdapter) DisplayName() string { return "Telegram" }
|
||||
|
||||
// ConfigSchema — Telegram uses Bot API long-polling. The bot token comes
|
||||
// from @BotFather; chat_id is a comma-separated list discovered via the
|
||||
// "Detect Chats" UI flow (calls Bot.getUpdates).
|
||||
func (t *TelegramAdapter) ConfigSchema() []ConfigField {
|
||||
return []ConfigField{
|
||||
{
|
||||
Key: "bot_token",
|
||||
Label: "Bot Token",
|
||||
Type: "password",
|
||||
Required: true,
|
||||
Sensitive: true,
|
||||
Placeholder: "123456789:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
|
||||
Help: "From @BotFather → /newbot (or /token on an existing bot).",
|
||||
},
|
||||
{
|
||||
Key: "chat_id",
|
||||
Label: "Chat IDs",
|
||||
Type: "text",
|
||||
Required: true,
|
||||
Placeholder: "-100123456789, -100987654321",
|
||||
Help: "Comma-separated chat IDs. Use \"Detect Chats\" after adding the bot to groups or sending /start in DMs.",
|
||||
},
|
||||
}
|
||||
}
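// The comma-separated chat_id value above is easy to normalise before use; a
// minimal sketch (helper name is illustrative, not from this repo):
//
//	func splitChatIDs(raw string) []string {
//		var ids []string
//		for _, part := range strings.Split(raw, ",") {
//			if id := strings.TrimSpace(part); id != "" {
//				ids = append(ids, id)
//			}
//		}
//		return ids
//	}
//
// splitChatIDs("-100123456789, -100987654321") yields the two trimmed IDs.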
|
||||
|
||||
func (t *TelegramAdapter) ValidateConfig(config map[string]interface{}) error {
|
||||
token, _ := config["bot_token"].(string)
|
||||
if token == "" {
|
||||
|
||||
@ -142,13 +142,29 @@ func validateAgentURL(rawURL string) error {
|
||||
{"127.0.0.0/8", "loopback address"},
|
||||
{"fe80::/10", "IPv6 link-local address (cloud metadata analogue)"},
|
||||
{"::1/128", "IPv6 loopback address"},
|
||||
// Always-blocked regardless of deploy mode: these ranges are never valid
// agent URLs in any deployment. The TEST-NET ranges (RFC-5737) are
// documentation-only. CGNAT space (RFC-6598) is not used for standard VPC
// subnets on the major cloud providers. IPv4 multicast is never a unicast
// endpoint. fc00::/8 is the reserved, non-locally-assigned half of the IPv6
// ULA block (fd00::/8, the locally-assigned half, is allowed in SaaS mode).
// RFC 3849: 2001:db8::/32 is the IPv6 documentation prefix.
|
||||
{"192.0.2.0/24", "TEST-NET-1 documentation range (RFC-5737)"},
|
||||
{"198.51.100.0/24", "TEST-NET-2 documentation range (RFC-5737)"},
|
||||
{"203.0.113.0/24", "TEST-NET-3 documentation range (RFC-5737)"},
|
||||
{"100.64.0.0/10", "carrier-grade NAT address (RFC-6598)"},
|
||||
{"224.0.0.0/4", "IPv4 multicast address"},
|
||||
{"fc00::/8", "IPv6 ULA non-routable prefix (fc00::/8)"},
|
||||
{"2001:db8::/32", "IPv6 documentation address (RFC-3849 reserved)"},
|
||||
}
|
||||
if !saasMode() {
|
||||
blockedRanges = append(blockedRanges,
|
||||
blockedRange{"10.0.0.0/8", "RFC-1918 private address"},
|
||||
blockedRange{"172.16.0.0/12", "RFC-1918 private address"},
|
||||
blockedRange{"192.168.0.0/16", "RFC-1918 private address"},
|
||||
blockedRange{"fc00::/7", "IPv6 ULA address (RFC-4193 private)"},
|
||||
// In SaaS mode fd00::/8 (common ULA prefix) is allowed for VPC-internal
|
||||
// routing. fc00::/8 is already always-blocked above. In non-SaaS mode
|
||||
// block the entire fc00::/7 supernet (covers both fd00 and fc00).
|
||||
blockedRange{"fd00::/8", "IPv6 ULA address (RFC-4193 private)"},
|
||||
)
|
||||
}
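// The matcher that consumes blockedRanges is outside this hunk; entries in
// this shape are typically applied roughly like the sketch below (field names
// cidr/reason are assumptions based on the positional literals above):
//
//	func matchBlocked(ip net.IP, ranges []blockedRange) (string, bool) {
//		for _, r := range ranges {
//			_, block, err := net.ParseCIDR(r.cidr)
//			if err != nil {
//				continue // malformed entry: skip rather than decide here
//			}
//			if block.Contains(ip) {
//				return r.reason, true
//			}
//		}
//		return "", false
//	}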
|
||||
|
||||
|
||||
@ -540,6 +540,21 @@ func TestValidateAgentURL(t *testing.T) {
|
||||
{"blocked IPv6 loopback [::1]", "http://[::1]:8080", true},
|
||||
{"blocked IPv6 link-local [fe80::1]", "http://[fe80::1]:8080", true},
|
||||
{"blocked IPv6 ULA [fd00::1]", "http://[fd00::1]:8080", true},
|
||||
|
||||
// ── Must be rejected: RFC 5737 TEST-NET reserved ranges ─────────────
|
||||
// These addresses are reserved for documentation and example code.
|
||||
// No production agent has a legitimate reason to use them.
|
||||
{"blocked TEST-NET-1 192.0.2.x", "http://192.0.2.1:8080", true},
|
||||
{"blocked TEST-NET-1 192.0.2.254", "http://192.0.2.254:9000", true},
|
||||
{"blocked TEST-NET-2 198.51.100.x", "http://198.51.100.1:8080", true},
|
||||
{"blocked TEST-NET-2 198.51.100.99", "http://198.51.100.99:8000", true},
|
||||
{"blocked TEST-NET-3 203.0.113.x", "http://203.0.113.1:8080", true},
|
||||
{"blocked TEST-NET-3 203.0.113.254", "http://203.0.113.254:9000", true},
|
||||
|
||||
// ── Must be rejected: RFC 3849 IPv6 documentation prefix ────────────
|
||||
{"blocked IPv6 documentation 2001:db8::1", "http://[2001:db8::1]:8080", true},
|
||||
{"blocked IPv6 documentation 2001:db8::ffff", "http://[2001:db8::ffff]:8000", true},
|
||||
|
||||
// IPv4-mapped IPv6 for a blocked range must also be rejected.
|
||||
// Go normalises ::ffff:169.254.x.x to IPv4 via To4(), so the existing
|
||||
// 169.254.0.0/16 entry catches it without a dedicated rule.
|
||||
@ -570,6 +585,91 @@ func TestValidateAgentURL(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateAgentURL_SaaSMode_AllowsRFC1918 is the integration-level wrapper test
|
||||
// for the SaaS-mode SSRF relaxation in validateAgentURL (used at registration).
|
||||
// It exercises validateAgentURL as called by the Register handler, not just the
|
||||
// inner blockedRanges slice. Regression guard for the same class of bug as
|
||||
// isSafeURL (issue #1785).
|
||||
func TestValidateAgentURL_SaaSMode_AllowsRFC1918(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
for _, url := range []string{
|
||||
"http://10.1.2.3/agent",
|
||||
"http://10.0.0.5:8000/a2a",
|
||||
"http://172.16.0.1/agent",
|
||||
"http://172.18.0.42:8000/a2a",
|
||||
"http://172.31.44.78/agent",
|
||||
"http://192.168.1.100/agent",
|
||||
"http://192.168.255.254:9000/a2a",
|
||||
"http://[fd00::1]/agent",
|
||||
"http://[fd12:3456:789a::42]/a2a",
|
||||
} {
|
||||
if err := validateAgentURL(url); err != nil {
|
||||
t.Errorf("validateAgentURL(%q) in saasMode: got %v, want nil", url, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateAgentURL_SaaSMode_StillBlocksMetadataEtAl verifies that even in
|
||||
// SaaS mode the always-blocked ranges (metadata, loopback, TEST-NET, CGNAT,
|
||||
// non-fd00 ULA) stay blocked.
|
||||
func TestValidateAgentURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
for _, url := range []string{
|
||||
"http://169.254.169.254/latest/meta-data/",
|
||||
"http://169.254.0.1/",
|
||||
"http://127.0.0.1:8080",
|
||||
"http://[::1]:8080",
|
||||
"http://192.0.2.5/agent",
|
||||
"http://198.51.100.5/a2a",
|
||||
"http://203.0.113.42/agent",
|
||||
"http://100.64.0.1/agent",
|
||||
"http://100.127.255.254:8000/a2a",
|
||||
"http://[fc00::1]/agent",
|
||||
"http://224.0.0.1/",
|
||||
} {
|
||||
if err := validateAgentURL(url); err == nil {
|
||||
t.Errorf("validateAgentURL(%q) in saasMode: got nil, want block", url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateAgentURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart
|
||||
// to TestValidateAgentURL_SaaSMode_AllowsRFC1918.
|
||||
func TestValidateAgentURL_StrictMode_BlocksRFC1918(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted")
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
for _, url := range []string{
|
||||
"http://10.1.2.3/agent",
|
||||
"http://172.16.0.1:8000/a2a",
|
||||
"http://172.31.44.78/agent",
|
||||
"http://192.168.1.100/agent",
|
||||
"http://[fd00::1]/agent",
|
||||
} {
|
||||
if err := validateAgentURL(url); err == nil {
|
||||
t.Errorf("validateAgentURL(%q) in strict mode: got nil, want block", url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateAgentURL_SaaSMode_LegacyOrgID covers the legacy MOLECULE_ORG_ID
|
||||
// signal (no MOLECULE_DEPLOY_MODE set) for validateAgentURL.
|
||||
func TestValidateAgentURL_SaaSMode_LegacyOrgID(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "")
|
||||
t.Setenv("MOLECULE_ORG_ID", "7b2179dc-8cc6-4581-a3c6-c8bff4481086")
|
||||
for _, url := range []string{
|
||||
"http://10.1.2.3/agent",
|
||||
"http://172.18.0.42:8000/a2a",
|
||||
"http://192.168.1.100/agent",
|
||||
"http://[fd00::1]/agent",
|
||||
} {
|
||||
if err := validateAgentURL(url); err != nil {
|
||||
t.Errorf("validateAgentURL(%q) with legacy MOLECULE_ORG_ID: got %v, want nil", url, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== C18 — Register ownership ====================
|
||||
|
||||
// TestRegister_C18_BootstrapAllowedNoTokens verifies that a workspace with NO
|
||||
|
||||
@ -326,4 +326,101 @@ func TestDevModeAllowsLoopback_Predicate(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsSafeURL_SaaSMode_AllowsRFC1918 is the integration-level wrapper test
|
||||
// for the SaaS-mode SSRF relaxation. It exercises isSafeURL (the public API),
|
||||
// not isPrivateOrMetadataIP (the inner helper), ensuring the wrapper correctly
|
||||
// propagates saasMode() to its helper.
|
||||
//
|
||||
// Regression guard: isSafeURL previously hardcoded RFC-1918 rejection and never
|
||||
// called saasMode(), causing 502 on every A2A call from Docker-networked or VPC
|
||||
// deployments (issue #1785 / PR #1785). The inner helper's TestIsPrivateOrMetadataIP_SaaSMode
|
||||
// was green the whole time — classic "test the intent, not the integration" gap.
|
||||
func TestIsSafeURL_SaaSMode_AllowsRFC1918(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
for _, url := range []string{
|
||||
"http://10.1.2.3/agent",
|
||||
"http://10.0.0.5:8000/a2a",
|
||||
"http://172.16.0.1/agent",
|
||||
"http://172.18.0.42:8000/a2a",
|
||||
"http://172.31.44.78/agent",
|
||||
"http://192.168.1.100/agent",
|
||||
"http://192.168.255.254:9000/a2a",
|
||||
"http://[fd00::1]/agent",
|
||||
"http://[fd12:3456:789a::42]/a2a",
|
||||
} {
|
||||
if err := isSafeURL(url); err != nil {
|
||||
t.Errorf("isSafeURL(%q) in saasMode: got %v, want nil", url, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl verifies that even in SaaS
|
||||
// mode the always-blocked ranges (metadata, loopback, TEST-NET, CGNAT) stay blocked.
|
||||
func TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
for _, url := range []string{
|
||||
// Cloud metadata — must stay blocked in every mode.
|
||||
"http://169.254.169.254/latest/meta-data/",
|
||||
"http://169.254.0.1/",
|
||||
// Loopback — must stay blocked.
|
||||
"http://127.0.0.1:8080",
|
||||
"http://[::1]:8080",
|
||||
// TEST-NET documentation ranges — must stay blocked.
|
||||
"http://192.0.2.5/agent",
|
||||
"http://198.51.100.5/a2a",
|
||||
"http://203.0.113.42/agent",
|
||||
// CGNAT — must stay blocked.
|
||||
"http://100.64.0.1/agent",
|
||||
"http://100.127.255.254:8000/a2a",
|
||||
// ULA fc00::/8 (non-fd00 half) — must stay blocked in SaaS.
|
||||
"http://[fc00::1]/agent",
|
||||
// IPv4 multicast is never a unicast endpoint and must stay blocked.
|
||||
"http://224.0.0.1/",
|
||||
} {
|
||||
if err := isSafeURL(url); err == nil {
|
||||
t.Errorf("isSafeURL(%q) in saasMode: got nil, want block", url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsSafeURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart to
|
||||
// TestIsSafeURL_SaaSMode_AllowsRFC1918. In self-hosted / single-container
|
||||
// deployments there is no legitimate reason to reach RFC-1918 agents, so the
|
||||
// wrapper must block them.
|
||||
func TestIsSafeURL_StrictMode_BlocksRFC1918(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted")
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
for _, url := range []string{
|
||||
"http://10.1.2.3/agent",
|
||||
"http://172.16.0.1:8000/a2a",
|
||||
"http://172.31.44.78/agent",
|
||||
"http://192.168.1.100/agent",
|
||||
"http://[fd00::1]/agent",
|
||||
} {
|
||||
if err := isSafeURL(url); err == nil {
|
||||
t.Errorf("isSafeURL(%q) in strict mode: got nil, want block", url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsSafeURL_SaasMode_LegacyOrgID covers the legacy MOLECULE_ORG_ID signal
|
||||
// (no MOLECULE_DEPLOY_MODE set). An org ID alone is sufficient to activate SaaS
|
||||
// mode per the saasMode() resolution ladder.
|
||||
func TestIsSafeURL_SaasMode_LegacyOrgID(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "")
|
||||
t.Setenv("MOLECULE_ORG_ID", "7b2179dc-8cc6-4581-a3c6-c8bff4481086")
|
||||
for _, url := range []string{
|
||||
"http://10.1.2.3/agent",
|
||||
"http://172.18.0.42:8000/a2a",
|
||||
"http://192.168.1.100/agent",
|
||||
"http://[fd00::1]/agent",
|
||||
} {
|
||||
if err := isSafeURL(url); err != nil {
|
||||
t.Errorf("isSafeURL(%q) with legacy MOLECULE_ORG_ID: got %v, want nil", url, err)
|
||||
}
|
||||
}
|
||||
}
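// saasMode() is not part of this diff; per the resolution ladder these tests
// rely on, it presumably resolves roughly like this (sketch only, assuming the
// two env vars the tests set; the real function may consult more signals):
//
//	func saasMode() bool {
//		switch os.Getenv("MOLECULE_DEPLOY_MODE") {
//		case "saas":
//			return true
//		case "self-hosted":
//			return false
//		}
//		// Legacy signal: an org ID alone activates SaaS mode.
//		return os.Getenv("MOLECULE_ORG_ID") != ""
//	}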
|
||||
@ -77,17 +77,26 @@ func (h *TerminalHandler) HandleConnect(c *gin.Context) {
|
||||
// A2A message-passing, so we apply the same hierarchy check here.
|
||||
// GH#756/#1609 security fix: if the caller claims a specific workspace
|
||||
// identity (X-Workspace-ID header), the bearer token — if present — must
|
||||
// belong to that claimed workspace. ValidateAnyToken accepted ANY valid org
|
||||
// token, allowing Workspace A to forge X-Workspace-ID: B and reach B's
|
||||
// terminal if A held any valid token. ValidateToken binds the token to
|
||||
// the claimed workspace identity.
|
||||
// belong to that claimed workspace. Previously ValidateAnyToken accepted
|
||||
// ANY valid org token, allowing Workspace A to forge X-Workspace-ID: B
|
||||
// and reach B's terminal if A held any valid token. ValidateToken binds
|
||||
// the workspace-scoped token to the claimed workspace identity. Org-level
|
||||
// tokens are handled separately via the org_token_id context key.
|
||||
callerID := c.GetHeader("X-Workspace-ID")
|
||||
if callerID != "" && callerID != workspaceID {
|
||||
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
|
||||
if tok != "" {
|
||||
if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil {
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
|
||||
return
|
||||
// Org-scoped tokens (org_api_tokens) are validated at the org level
|
||||
// by WorkspaceAuth and do not have a workspace_auth_tokens row, so
|
||||
// ValidateToken always returns ErrInvalidToken for them. If WorkspaceAuth
|
||||
// already validated an org token (org_token_id set in context), trust
|
||||
// the X-Workspace-ID claim — the hierarchy is enforced by
|
||||
// canCommunicateCheck below. Reject everything else.
|
||||
if c.GetString("org_token_id") == "" {
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
if !canCommunicateCheck(callerID, workspaceID) {
|
||||
|
||||
@ -455,3 +455,38 @@ func TestTerminalConnect_KI005_AllowsSiblingWorkspace(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestKI005_OrgToken_SkipsValidateToken verifies that when WorkspaceAuth already
|
||||
// validated an org token (org_token_id set in gin context), the X-Workspace-ID
|
||||
// claim is trusted without a workspace_auth_tokens lookup. The hierarchy is still
|
||||
// enforced by canCommunicateCheck. Regression guard for the A2A routing regression
|
||||
// introduced in GH#1885: internal routing uses org tokens which are not in
|
||||
// workspace_auth_tokens, so ValidateToken would always fail for them.
|
||||
func TestKI005_OrgToken_SkipsValidateToken(t *testing.T) {
|
||||
setupTestDB(t) // no ValidateToken ExpectQuery — none should fire
|
||||
prev := canCommunicateCheck
|
||||
canCommunicateCheck = func(callerID, targetID string) bool {
|
||||
// Simulate platform agent → target workspace (same org).
|
||||
return callerID == "ws-platform" && targetID == "ws-target"
|
||||
}
|
||||
defer func() { canCommunicateCheck = prev }()
|
||||
|
||||
h := NewTerminalHandler(nil)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-target"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-target/terminal", nil)
|
||||
c.Request.Header.Set("X-Workspace-ID", "ws-platform")
|
||||
c.Request.Header.Set("Authorization", "Bearer org-token-abc123")
|
||||
// Simulate WorkspaceAuth having validated the org token (orgtoken.Validate
|
||||
// succeeded). HandleConnect must skip ValidateToken and trust the claim.
|
||||
c.Set("org_token_id", "tok-org-abc")
|
||||
|
||||
h.HandleConnect(c)
|
||||
|
||||
// Org token path: ValidateToken skipped → canCommunicateCheck=true →
|
||||
// falls through to Docker path → 503 nil-docker (no Docker client).
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Errorf("org-token A2A: got %d, want 503 nil-docker (%s)", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ package handlers
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
@ -388,9 +389,24 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
|
||||
// Now stop containers + remove volumes for all descendants (any depth).
|
||||
// Any concurrent heartbeat / registration / liveness-triggered restart
|
||||
// will see status='removed' and bail out early.
|
||||
//
|
||||
// #1843: Stop() errors used to be silently swallowed. On the CP/EC2
|
||||
// backend, Stop() calls the control plane's DELETE workspaces endpoint
|
||||
// to terminate the EC2; if that errors (CP transient 5xx, network),
|
||||
// the EC2 stays running with no DB row to track it — the
|
||||
// "14 orphan workspace EC2s on a 0-customer account" scenario.
|
||||
// Aggregate Stop failures and surface them as 500 so the client can
|
||||
// retry. The retry replays Stop with the same instance_id (still
|
||||
// readable from the row even after status='removed') — idempotent on
|
||||
// the CP side. RemoveVolume errors stay log-and-continue: those are
|
||||
// local cleanup of /var/data, not infra-leak class.
|
||||
var stopErrs []error
|
||||
for _, descID := range descendantIDs {
|
||||
if h.provisioner != nil {
|
||||
h.provisioner.Stop(ctx, descID)
|
||||
if err := h.provisioner.Stop(ctx, descID); err != nil {
|
||||
log.Printf("Delete descendant %s stop error: %v", descID, err)
|
||||
stopErrs = append(stopErrs, fmt.Errorf("stop descendant %s: %w", descID, err))
|
||||
}
|
||||
if err := h.provisioner.RemoveVolume(ctx, descID); err != nil {
|
||||
log.Printf("Delete descendant %s volume removal warning: %v", descID, err)
|
||||
}
|
||||
@ -401,7 +417,10 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
|
||||
|
||||
// Stop + remove volume for the workspace itself
|
||||
if h.provisioner != nil {
|
||||
h.provisioner.Stop(ctx, id)
|
||||
if err := h.provisioner.Stop(ctx, id); err != nil {
|
||||
log.Printf("Delete %s stop error: %v", id, err)
|
||||
stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", id, err))
|
||||
}
|
||||
if err := h.provisioner.RemoveVolume(ctx, id); err != nil {
|
||||
log.Printf("Delete %s volume removal warning: %v", id, err)
|
||||
}
|
||||
@ -412,6 +431,21 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
|
||||
"cascade_deleted": len(descendantIDs),
|
||||
})
|
||||
|
||||
// If any Stop call failed, surface 500 so the client retries. The DB
|
||||
// row is already 'removed' (idempotent), and Stop's instance_id
|
||||
// lookup tolerates that — the retry replays the terminate. This is
|
||||
// the loud-fail-instead-of-silent-leak choice; users see a 500
|
||||
// instead of an orphaned EC2.
|
||||
if len(stopErrs) > 0 {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{
|
||||
"error": fmt.Sprintf("workspace marked removed, but %d stop call(s) failed — please retry: %v",
|
||||
len(stopErrs), errors.Join(stopErrs...)),
|
||||
"removed_count": len(allIDs),
|
||||
"stop_failures": len(stopErrs),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Hard purge: cascade delete all FK data and remove the DB row entirely (#1087)
|
||||
if c.Query("purge") == "true" {
|
||||
purgeIDs := pq.Array(allIDs)
|
||||
|
||||
@ -96,6 +96,14 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
applyAgentGitIdentity(envVars, payload.Name)
|
||||
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
|
||||
|
||||
// Propagate the workspace's role into env so role-aware plugins
|
||||
// (gh-identity — molecule-core#1957) can read it without the
|
||||
// plugin interface having to carry the full payload. Role is
|
||||
// cosmetic metadata — no auth weight on it — safe to surface as env.
|
||||
if payload.Role != "" {
|
||||
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
|
||||
}
|
||||
|
||||
// Plugin extension point: run any registered EnvMutators (e.g.
|
||||
// github-app-auth, vault-secrets) AFTER built-in identity injection so
|
||||
// plugins can override or augment GIT_AUTHOR_*, GITHUB_TOKEN, etc.
|
||||
@ -688,6 +696,11 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
|
||||
|
||||
applyAgentGitIdentity(envVars, payload.Name)
|
||||
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
|
||||
// Propagate role for role-aware plugins (#1957). See provisionWorkspace
|
||||
// above for rationale.
|
||||
if payload.Role != "" {
|
||||
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
|
||||
}
|
||||
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
|
||||
log.Printf("CPProvisioner: env mutator failed for %s: %v", workspaceID, err)
|
||||
// F1086 / #1206: env mutator errors (missing tokens, vault paths) must not
|
||||
|
||||
@ -304,6 +304,7 @@ func CanvasOrBearer(database *sql.DB) gin.HandlerFunc {
|
||||
}
|
||||
|
||||
c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "admin auth required"})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1011,8 +1011,10 @@ func TestCanvasOrBearer_TokensExist_NoCreds_Returns401(t *testing.T) {
|
||||
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
|
||||
|
||||
handlerCalled := false
|
||||
r := gin.New()
|
||||
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
|
||||
handlerCalled = true
|
||||
c.JSON(http.StatusOK, gin.H{"ok": true})
|
||||
})
|
||||
|
||||
@ -1023,6 +1025,47 @@ func TestCanvasOrBearer_TokensExist_NoCreds_Returns401(t *testing.T) {
|
||||
if w.Code != http.StatusUnauthorized {
|
||||
t.Errorf("no creds: got %d, want 401", w.Code)
|
||||
}
|
||||
if handlerCalled {
|
||||
t.Error("handler was called after AbortWithStatusJSON — missing return allows fall-through")
|
||||
}
|
||||
if body := w.Body.String(); body == `{"ok":true}` {
|
||||
t.Error("handler body written after AbortWithStatusJSON")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanvasOrBearer_TokensExist_WrongOrigin_Returns401(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatalf("sqlmock: %v", err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
|
||||
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
|
||||
|
||||
t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app")
|
||||
|
||||
handlerCalled := false
|
||||
r := gin.New()
|
||||
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
|
||||
handlerCalled = true
|
||||
c.JSON(http.StatusOK, gin.H{"ok": true})
|
||||
})
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
|
||||
req.Header.Set("Origin", "https://evil.example.com")
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusUnauthorized {
|
||||
t.Errorf("wrong origin: got %d, want 401", w.Code)
|
||||
}
|
||||
if handlerCalled {
|
||||
t.Error("handler was called after AbortWithStatusJSON — missing return allows fall-through")
|
||||
}
|
||||
if body := w.Body.String(); body == `{"ok":true}` {
|
||||
t.Error("handler body written after AbortWithStatusJSON")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanvasOrBearer_TokensExist_CanvasOrigin_Passes(t *testing.T) {
|
||||
@ -1100,7 +1143,7 @@ func TestAdminAuth_RemovedWorkspaceToken_Returns401(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanvasOrBearer_TokensExist_WrongOrigin_Returns401(t *testing.T) {
|
||||
func TestCanvasOrBearer_WrongOrigin_Blocked(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatalf("sqlmock: %v", err)
|
||||
|
||||
@ -18,30 +18,49 @@ type ProvisionTimeoutEmitter interface {
|
||||
}
|
||||
|
||||
// DefaultProvisioningTimeout is how long a workspace may sit in
|
||||
// status='provisioning' before the sweeper flips it to 'failed'. The
|
||||
// container-launch path has its own 3-minute context timeout
|
||||
// (provisioner.ProvisionTimeout) but that only bounds the docker API call —
|
||||
// a container that started but crashes before /registry/register never
|
||||
// triggers that path and would sit in provisioning forever. 10 minutes
|
||||
// covers pathological image-pull + user-data execution on a cold EC2 worker
|
||||
// while still getting well ahead of the "15+ minute" stuck state users see
|
||||
// in production.
|
||||
// status='provisioning' before the sweeper flips it to 'failed'.
|
||||
// Default for non-hermes runtimes (claude-code, langgraph, crewai,
|
||||
// autogen, etc.) which cold-boot in <5 min. The container-launch path
|
||||
// has its own 3-minute context timeout (provisioner.ProvisionTimeout)
|
||||
// but that only bounds the docker API call — a container that started
|
||||
// but crashes before /registry/register never triggers that path and
|
||||
// would sit in provisioning forever. 10 minutes covers pathological
|
||||
// image-pull + user-data execution on a cold EC2 worker while still
|
||||
// getting well ahead of the "15+ minute" stuck state users see in
|
||||
// production.
|
||||
const DefaultProvisioningTimeout = 10 * time.Minute
|
||||
|
||||
// HermesProvisioningTimeout matches the CP bootstrap-watcher's
|
||||
// runtime-aware deadline (cp#245) for hermes workspaces: 25 min watcher
|
||||
// + 5 min sweep slack. Hermes cold-boot does apt + uv + Python venv +
|
||||
// Node + hermes-agent install — 13–25 min on slow apt mirrors is
|
||||
// normal. Without this, the sweep would flip the workspace to 'failed'
|
||||
// at 10 min while the watcher (and the workspace itself) is still
|
||||
// happily progressing through install. Issue #1843 follow-up: a
|
||||
// healthy 10.5-min hermes boot was killed by the 10-min sweep on
|
||||
// 2026-04-26, breaking #2061's E2E.
|
||||
const HermesProvisioningTimeout = 30 * time.Minute
|
||||
|
||||
// DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
|
||||
// as the hibernation monitor — cheap and bounded by the provisioning-state
|
||||
// query which hits the primary key / status partial index.
|
||||
const DefaultProvisionSweepInterval = 30 * time.Second
|
||||
|
||||
// provisioningTimeout reads the override from env, falling back to the
|
||||
// default. Env var expressed in seconds so operators can tune via a normal
|
||||
// container restart without a code change.
|
||||
func provisioningTimeout() time.Duration {
|
||||
// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
|
||||
// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
|
||||
// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
|
||||
// runtimes — useful for ops debugging but loses the runtime nuance, so
|
||||
// operators should prefer the defaults unless they have a specific
|
||||
// reason.
|
||||
func provisioningTimeoutFor(runtime string) time.Duration {
|
||||
if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
|
||||
if n, err := strconv.Atoi(v); err == nil && n > 0 {
|
||||
return time.Duration(n) * time.Second
|
||||
}
|
||||
}
|
||||
if runtime == "hermes" {
|
||||
return HermesProvisioningTimeout
|
||||
}
|
||||
return DefaultProvisioningTimeout
|
||||
}
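// The CP-side counterpart this mirrors (bootstrap_watcher.go's
// bootstrapTimeoutFn, cp#245) is not part of this diff; conceptually it is the
// same per-runtime switch with a 25-minute hermes budget, e.g. (sketch only,
// non-hermes value illustrative):
//
//	func bootstrapTimeoutFn(runtime string) time.Duration {
//		if runtime == "hermes" {
//			return 25 * time.Minute // sweep adds 5 min slack on top
//		}
//		return 10 * time.Minute
//	}
//
// Keeping sweep timeout > watcher timeout for every runtime is the invariant
// that prevents the sweep from killing boots the watcher still considers healthy.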
|
||||
|
||||
@ -65,7 +84,8 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s)", interval, provisioningTimeout())
|
||||
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
|
||||
interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)
|
||||
|
||||
for {
|
||||
select {
|
||||
@ -80,33 +100,51 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
|
||||
// sweepStuckProvisioning is one tick of the sweeper. Exported-for-test via
|
||||
// the package boundary: keep all time.Now reads inside so tests can drive it
|
||||
// deterministically by seeding updated_at rather than manipulating time.
|
||||
//
|
||||
// Runtime-aware: the per-workspace timeout depends on `runtime`. Hermes
|
||||
// gets 30 min (matching the CP bootstrap-watcher's 25-min deadline + 5
|
||||
// min slack); everything else gets 10 min. Without this distinction a
|
||||
// healthy hermes cold-boot at 10–25 min got killed mid-install by this
|
||||
// sweep, leaving an incoherent "marked failed but actually working"
|
||||
// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
|
||||
// canonical CP-side gating.
|
||||
func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
|
||||
timeout := provisioningTimeout()
|
||||
timeoutSec := int(timeout / time.Second)
|
||||
|
||||
// Read candidates first so the event broadcast can include each id. The
|
||||
// subsequent UPDATE re-checks the predicate to stay race-safe against
|
||||
// concurrent restart / register paths that write updated_at.
|
||||
// We can't pre-filter by age in SQL because the threshold depends
|
||||
// on the row's runtime. Pull every provisioning row + its runtime
|
||||
// + its age, evaluate per-row in Go. Still cheap — the
|
||||
// status='provisioning' row count is bounded (workspaces in
|
||||
// flight, not historical) and the partial index on status keeps
|
||||
// it fast.
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id FROM workspaces
|
||||
SELECT id, COALESCE(runtime, ''), EXTRACT(EPOCH FROM (now() - updated_at))::int
|
||||
FROM workspaces
|
||||
WHERE status = 'provisioning'
|
||||
AND updated_at < now() - ($1 || ' seconds')::interval
|
||||
`, timeoutSec)
|
||||
`)
|
||||
if err != nil {
|
||||
log.Printf("Provision-timeout sweep: query error: %v", err)
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var ids []string
|
||||
type candidate struct {
|
||||
id string
|
||||
runtime string
|
||||
ageSec int
|
||||
}
|
||||
var ids []candidate
|
||||
for rows.Next() {
|
||||
var id string
|
||||
if err := rows.Scan(&id); err == nil {
|
||||
ids = append(ids, id)
|
||||
var c candidate
|
||||
if err := rows.Scan(&c.id, &c.runtime, &c.ageSec); err == nil {
|
||||
ids = append(ids, c)
|
||||
}
|
||||
}
|
||||
|
||||
for _, id := range ids {
|
||||
for _, c := range ids {
|
||||
timeout := provisioningTimeoutFor(c.runtime)
|
||||
timeoutSec := int(timeout / time.Second)
|
||||
if c.ageSec < timeoutSec {
|
||||
continue
|
||||
}
|
||||
msg := "provisioning timed out — container started but never called /registry/register. Check container logs and network connectivity to the platform."
|
||||
res, err := db.DB.ExecContext(ctx, `
|
||||
UPDATE workspaces
|
||||
@ -116,9 +154,9 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
|
||||
WHERE id = $1
|
||||
AND status = 'provisioning'
|
||||
AND updated_at < now() - ($3 || ' seconds')::interval
|
||||
`, id, msg, timeoutSec)
|
||||
`, c.id, msg, timeoutSec)
|
||||
if err != nil {
|
||||
log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", id, err)
|
||||
log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", c.id, err)
|
||||
continue
|
||||
}
|
||||
affected, _ := res.RowsAffected()
|
||||
@ -126,18 +164,19 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
|
||||
// Raced with restart / register — no harm, just skip.
|
||||
continue
|
||||
}
|
||||
log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
|
||||
log.Printf("Provision-timeout sweep: %s (runtime=%q) stuck in provisioning > %s — marked failed", c.id, c.runtime, timeout)
|
||||
// Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the
|
||||
// canvas event handler only flips node state on the _FAILED case.
|
||||
// A separate event type was considered but the UI reaction is
|
||||
// identical either way — operators who need to distinguish can
|
||||
// tell from the `source` payload field.
|
||||
if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{
|
||||
if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", c.id, map[string]interface{}{
|
||||
"error": msg,
|
||||
"timeout_secs": timeoutSec,
|
||||
"runtime": c.runtime,
|
||||
"source": "provision_timeout_sweep",
|
||||
}); emitErr != nil {
|
||||
log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
|
||||
log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", c.id, emitErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
)
|
||||
@ -40,13 +41,24 @@ func (f *fakeEmitter) count() int {
|
||||
return len(f.events)
|
||||
}
|
||||
|
||||
// candidateRows builds the new-shape query result (id, runtime, age_sec).
|
||||
// Use this in every sweep test to match the runtime-aware SELECT.
|
||||
func candidateRows(rows ...[3]any) *sqlmock.Rows {
|
||||
r := sqlmock.NewRows([]string{"id", "runtime", "age_sec"})
|
||||
for _, row := range rows {
|
||||
r = r.AddRow(row[0], row[1], row[2])
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
|
||||
// provisioning workspace gets flipped to failed AND an event is broadcast.
|
||||
func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
|
||||
// claude-code workspace, 700s old > 600s default timeout → flipped.
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
@ -69,6 +81,60 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_HermesGets30MinSlack — the regression that
|
||||
// motivated the runtime-aware change. A hermes workspace 11 min into
|
||||
// cold-boot must NOT be flipped to failed; the watcher's 25-min budget
|
||||
// covers it. Without the fix, the 10-min sweep killed healthy hermes
|
||||
// boots mid-install (issue #2061's E2E failure on 2026-04-26).
|
||||
func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
// 11 min = 660 sec. < HermesProvisioningTimeout (1800s).
|
||||
// No UPDATE should fire — hermes still has time.
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
|
||||
if emit.count() != 0 {
|
||||
t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_HermesPastDeadline — a hermes workspace
|
||||
// past 30 min DOES get flipped. Closes the loop on the runtime-aware
|
||||
// fix: it's still bounded, just with a longer threshold than other
|
||||
// runtimes.
|
||||
func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
// 31 min = 1860 sec > HermesProvisioningTimeout (1800s).
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([3]any{"ws-hermes-stuck", "hermes", 1860}))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-hermes-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
|
||||
if emit.count() != 1 {
|
||||
t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
|
||||
}
|
||||
// Payload should include runtime so ops can distinguish in logs.
|
||||
payload, ok := emit.events[0].Payload.(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("payload not a map: %T", emit.events[0].Payload)
|
||||
}
|
||||
if payload["runtime"] != "hermes" {
|
||||
t.Errorf("payload.runtime = %v, want hermes", payload["runtime"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
|
||||
// 0 rows because the workspace flipped to online (or got restarted) between
|
||||
// the SELECT and the UPDATE. We should skip the event, not emit a false
|
||||
@ -76,8 +142,8 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
|
||||
func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-raced"))
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([3]any{"ws-raced", "claude-code", 700}))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
@ -99,8 +165,8 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
|
||||
func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}))
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows())
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
@ -115,14 +181,16 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
|
||||
|
||||
// TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
|
||||
// both agents (claude-code + hermes) are stuck — both should get flipped
|
||||
// and both should get events.
|
||||
// and both should get events. claude-code at 11 min (over its 10-min
|
||||
// limit), hermes at 31 min (over its 30-min limit).
|
||||
func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).
|
||||
AddRow("ws-claude-code").
|
||||
AddRow("ws-hermes"))
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows(
|
||||
[3]any{"ws-claude-code", "claude-code", 700},
|
||||
[3]any{"ws-hermes", "hermes", 1860},
|
||||
))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
@ -145,8 +213,8 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
|
||||
func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
|
||||
mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
|
||||
WillReturnRows(candidateRows([3]any{"ws-stuck", "claude-code", 700}))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
@ -158,18 +226,47 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
|
||||
|
||||
// TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
|
||||
// env var takes effect when set to a positive integer, and falls back to
|
||||
// default otherwise.
|
||||
// the per-runtime default otherwise.
|
||||
func TestProvisioningTimeout_EnvOverride(t *testing.T) {
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
|
||||
if got := provisioningTimeout(); got.Seconds() != 60 {
|
||||
t.Errorf("override: got %v, want 60s", got)
|
||||
// When env override is set it wins over runtime defaults.
|
||||
if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
|
||||
t.Errorf("override (no runtime): got %v, want 60s", got)
|
||||
}
|
||||
if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
|
||||
t.Errorf("override (hermes): got %v, want 60s", got)
|
||||
}
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
|
||||
if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("default: got %v, want %v", got, DefaultProvisioningTimeout)
|
||||
if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
|
||||
}
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
|
||||
if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("bad override: got %v, want default %v", got, DefaultProvisioningTimeout)
|
||||
if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
|
||||
}
|
||||
}
|
||||
|
||||
// TestProvisioningTimeout_RuntimeAware verifies hermes gets the longer
|
||||
// HermesProvisioningTimeout while other runtimes keep the default.
|
||||
// Mirrors bootstrap_watcher.go's bootstrapTimeoutFn — these two
|
||||
// timeouts must stay in sync (sweep > watcher) or healthy hermes
|
||||
// boots get killed mid-install.
|
||||
func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
|
||||
cases := []struct {
|
||||
runtime string
|
||||
want time.Duration
|
||||
}{
|
||||
{"hermes", HermesProvisioningTimeout},
|
||||
{"langgraph", DefaultProvisioningTimeout},
|
||||
{"claude-code", DefaultProvisioningTimeout},
|
||||
{"crewai", DefaultProvisioningTimeout},
|
||||
{"autogen", DefaultProvisioningTimeout},
|
||||
{"", DefaultProvisioningTimeout},
|
||||
{"unknown-runtime", DefaultProvisioningTimeout},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := provisioningTimeoutFor(c.runtime); got != c.want {
|
||||
t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/google/uuid"
|
||||
cronlib "github.com/robfig/cron/v3"
|
||||
@ -23,8 +24,26 @@ const (
|
||||
fireTimeout = 5 * time.Minute
|
||||
phantomSweepInterval = 5 * time.Minute
|
||||
phantomStaleThreshold = 10 * time.Minute
|
||||
// #2026: per-DB-op deadline. Every scheduler DB call must complete
|
||||
// within this window or the Exec/Query is cancelled and the tick
|
||||
// continues. Before this, a slow/stuck DB op (bad UTF-8 rejected by
|
||||
// Postgres, connection pool exhausted, replica lag) would block a
|
||||
// fireSchedule goroutine indefinitely, which blocked wg.Wait() in
|
||||
// tick(), which stalled the entire scheduler until operator restart.
|
||||
dbQueryTimeout = 10 * time.Second
|
||||
)
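// The bounded-context pattern applied repeatedly below (WithTimeout around
// each Exec/Query, cancel right after) could also be factored into a tiny
// helper; a sketch of the idea (helper name is illustrative, the diff below
// inlines it at every call site):
//
//	func withDBTimeout(parent context.Context) (context.Context, context.CancelFunc) {
//		return context.WithTimeout(parent, dbQueryTimeout)
//	}
//
// Inlining keeps each call site's parent choice (ctx vs. context.Background())
// explicit, which matters for the bookkeeping writes that must survive outer
// cancellation.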
|
||||
|
||||
// sanitizeUTF8 replaces invalid UTF-8 byte sequences with the Unicode
|
||||
// replacement character. Used before writing agent-produced strings to
|
||||
// Postgres (text/jsonb columns reject invalid UTF-8, silently failing the
|
||||
// INSERT and holding the transaction open). #2026.
|
||||
func sanitizeUTF8(s string) string {
|
||||
if utf8.ValidString(s) {
|
||||
return s
|
||||
}
|
||||
return strings.ToValidUTF8(s, "\uFFFD")
|
||||
}
|
||||
|
||||
// A2AProxy is the interface the scheduler needs to send messages to workspaces.
|
||||
// WorkspaceHandler.ProxyA2ARequest satisfies this.
|
||||
type A2AProxy interface {
|
||||
@ -186,7 +205,10 @@ func (s *Scheduler) Start(ctx context.Context) {
|
||||
func (s *Scheduler) tick(ctx context.Context) {
|
||||
supervised.Heartbeat("scheduler")
|
||||
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
// #2026: bound the due-schedules query — if Postgres is slow/stuck
|
||||
// this fails fast instead of blocking the tick loop indefinitely.
|
||||
queryCtx, queryCancel := context.WithTimeout(ctx, dbQueryTimeout)
|
||||
rows, err := db.DB.QueryContext(queryCtx, `
|
||||
SELECT id, workspace_id, name, cron_expr, timezone, prompt
|
||||
FROM workspace_schedules
|
||||
WHERE enabled = true AND next_run_at IS NOT NULL AND next_run_at <= now()
|
||||
@ -194,9 +216,11 @@ func (s *Scheduler) tick(ctx context.Context) {
|
||||
LIMIT $1
|
||||
`, batchLimit)
|
||||
if err != nil {
|
||||
queryCancel()
|
||||
log.Printf("Scheduler: tick query error: %v", err)
|
||||
return
|
||||
}
|
||||
defer queryCancel()
|
||||
defer rows.Close()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
@ -276,20 +300,29 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
// to allow concurrent task processing (e.g. leaders handling A2A while cron runs).
|
||||
var activeTasks int
|
||||
var maxConcurrent int
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
// #2026: bound the capacity check — if the DB is slow, fail open
|
||||
// (skip the capacity wait, let fireTimeout catch a truly stuck fire)
|
||||
// rather than blocking here indefinitely.
|
||||
capCtx, capCancel := context.WithTimeout(ctx, dbQueryTimeout)
|
||||
capErr := db.DB.QueryRowContext(capCtx,
|
||||
`SELECT COALESCE(active_tasks, 0), COALESCE(max_concurrent_tasks, 1) FROM workspaces WHERE id = $1`,
|
||||
sched.WorkspaceID,
|
||||
).Scan(&activeTasks, &maxConcurrent); err == nil && activeTasks >= maxConcurrent {
|
||||
).Scan(&activeTasks, &maxConcurrent)
|
||||
capCancel()
|
||||
if capErr == nil && activeTasks >= maxConcurrent {
|
||||
log.Printf("Scheduler: '%s' workspace %s at capacity (active_tasks=%d, max=%d), deferring up to 2 min",
|
||||
sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent)
|
||||
// Poll every 10s for up to 2 minutes
|
||||
waited := false
|
||||
for i := 0; i < 12; i++ {
|
||||
time.Sleep(10 * time.Second)
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
pollCtx, pollCancel := context.WithTimeout(ctx, dbQueryTimeout)
|
||||
err := db.DB.QueryRowContext(pollCtx,
|
||||
`SELECT COALESCE(active_tasks, 0), COALESCE(max_concurrent_tasks, 1) FROM workspaces WHERE id = $1`,
|
||||
sched.WorkspaceID,
|
||||
).Scan(&activeTasks, &maxConcurrent); err != nil || activeTasks < maxConcurrent {
|
||||
).Scan(&activeTasks, &maxConcurrent)
|
||||
pollCancel()
|
||||
if err != nil || activeTasks < maxConcurrent {
|
||||
waited = true
|
||||
break
|
||||
}
|
||||
@ -362,7 +395,12 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
// per schedule; at 100 tenants × dozens of schedules the saved
|
||||
// query matters.
|
||||
var consecEmpty int
|
||||
if err := db.DB.QueryRowContext(ctx, `
|
||||
// #2026: bound the empty-run UPDATE — survives outer ctx cancellation
|
||||
// (uses Background()) so the bookkeeping completes even if fireTimeout
|
||||
// cancelled the HTTP call, and has its own deadline so a stuck DB
|
||||
// can't block the goroutine.
|
||||
emptyCtx, emptyCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
|
||||
if err := db.DB.QueryRowContext(emptyCtx, `
|
||||
UPDATE workspace_schedules
|
||||
SET consecutive_empty_runs = consecutive_empty_runs + 1,
|
||||
updated_at = now()
|
||||
@ -370,6 +408,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
RETURNING consecutive_empty_runs`, sched.ID).Scan(&consecEmpty); err != nil {
|
||||
log.Printf("Scheduler: '%s' empty-run bump failed: %v", sched.Name, err)
|
||||
}
|
||||
emptyCancel()
|
||||
if consecEmpty >= 3 {
|
||||
lastStatus = "stale"
|
||||
lastError = fmt.Sprintf("empty response %d consecutive times — agent may be phantom-producing (#795)", consecEmpty)
|
||||
@ -378,11 +417,13 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
}
|
||||
} else if lastStatus == "ok" {
|
||||
// Non-empty success — reset the counter
|
||||
db.DB.ExecContext(ctx, `
|
||||
resetCtx, resetCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
|
||||
_, _ = db.DB.ExecContext(resetCtx, `
|
||||
UPDATE workspace_schedules
|
||||
SET consecutive_empty_runs = 0,
|
||||
updated_at = now()
|
||||
WHERE id = $1`, sched.ID)
|
||||
resetCancel()
|
||||
}
|
||||
|
||||
nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now())
|
||||
@ -422,20 +463,31 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
|
||||
// Log a dedicated cron_run activity entry with schedule metadata so the
|
||||
// history endpoint can query by schedule_id.
|
||||
// #2026: sanitize the truncated prompt — even UTF-8-safe truncate() can
|
||||
// carry pre-existing invalid bytes from an agent-edited template. jsonb
|
||||
// columns reject invalid UTF-8 and hold the transaction open.
|
||||
cronMeta, _ := json.Marshal(map[string]interface{}{
|
||||
"schedule_id": sched.ID,
|
||||
"schedule_name": sched.Name,
|
||||
"cron_expr": sched.CronExpr,
|
||||
"prompt": truncate(sched.Prompt, 200),
|
||||
"prompt": sanitizeUTF8(truncate(sched.Prompt, 200)),
|
||||
})
|
||||
// #152: persist lastError into error_detail on the activity_logs row
|
||||
// so GET /workspaces/:id/schedules/:id/history can surface why a run
|
||||
// failed (previously dropped — history returned status without any
|
||||
// error context, making root-cause debugging impossible).
|
||||
_, _ = db.DB.ExecContext(ctx, `
|
||||
// #2026: bounded Background() context — this INSERT was observed wedging
|
||||
// indefinitely on invalid-UTF-8 jsonb payloads, blocking wg.Wait() in
|
||||
// tick() and stalling the whole scheduler. Now: 10s deadline, survives
|
||||
// outer ctx cancellation, and every string is UTF-8 sanitized.
|
||||
insertCtx, insertCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
|
||||
if _, insErr := db.DB.ExecContext(insertCtx, `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at)
|
||||
VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, $4, $5, now())
|
||||
`, sched.WorkspaceID, "Cron: "+sched.Name, string(cronMeta), lastStatus, lastError)
|
||||
`, sched.WorkspaceID, sanitizeUTF8("Cron: "+sched.Name), string(cronMeta), lastStatus, sanitizeUTF8(lastError)); insErr != nil {
|
||||
log.Printf("Scheduler: activity_logs insert failed for '%s' (%s): %v", sched.Name, sched.ID, insErr)
|
||||
}
|
||||
insertCancel()
|
||||
|
||||
if s.broadcaster != nil {
|
||||
s.broadcaster.RecordAndBroadcast(ctx, "CRON_EXECUTED", sched.WorkspaceID, map[string]interface{}{
|
||||
@ -483,7 +535,10 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
|
||||
// Advance next_run_at + bump run_count so the liveness view reflects
|
||||
// that we're still ticking. last_status='skipped', last_error carries
|
||||
// the reason for operators debugging via the schedule history API.
|
||||
_, _ = db.DB.ExecContext(ctx, `
|
||||
// #2026: bounded Background() context so the bookkeeping can't block
|
||||
// on a stuck DB and stall the scheduler.
|
||||
skipUpdCtx, skipUpdCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
|
||||
_, _ = db.DB.ExecContext(skipUpdCtx, `
|
||||
UPDATE workspace_schedules
|
||||
SET last_run_at = now(),
|
||||
next_run_at = COALESCE($2, next_run_at),
|
||||
@ -492,7 +547,8 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
|
||||
last_error = $3,
|
||||
updated_at = now()
|
||||
WHERE id = $1
|
||||
`, sched.ID, nextRunPtr, reason)
|
||||
`, sched.ID, nextRunPtr, sanitizeUTF8(reason))
|
||||
skipUpdCancel()
|
||||
|
||||
cronMeta, _ := json.Marshal(map[string]interface{}{
|
||||
"schedule_id": sched.ID,
|
||||
@ -501,10 +557,14 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
|
||||
"skipped": true,
|
||||
"active_tasks": activeTasks,
|
||||
})
|
||||
_, _ = db.DB.ExecContext(ctx, `
|
||||
// #2026: bounded Background() context on the skipped activity log INSERT
|
||||
// for the same reason as the fireSchedule activity_logs INSERT above.
|
||||
skipInsCtx, skipInsCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
|
||||
_, _ = db.DB.ExecContext(skipInsCtx, `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at)
|
||||
VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, 'skipped', $4, now())
|
||||
`, sched.WorkspaceID, "Cron skipped: "+sched.Name, string(cronMeta), reason)
|
||||
`, sched.WorkspaceID, sanitizeUTF8("Cron skipped: "+sched.Name), string(cronMeta), sanitizeUTF8(reason))
|
||||
skipInsCancel()
|
||||
|
||||
if s.broadcaster != nil {
|
||||
_ = s.broadcaster.RecordAndBroadcast(ctx, "CRON_SKIPPED", sched.WorkspaceID, map[string]interface{}{
|
||||
@ -690,11 +750,26 @@ func isEmptyResponse(body []byte) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// truncate shortens s to at most maxLen bytes, appending "..." if truncated.
|
||||
// #2026: UTF-8 safe — byte-slicing at maxLen-3 would split multi-byte runes
|
||||
// (observed: U+2026 `…` = 0xe2 0x80 0xa6, sliced mid-char, concatenated with
|
||||
// "..." producing 0xe2 0x80 0x2e — rejected by Postgres as invalid UTF-8,
|
||||
// which wedged the activity_logs INSERT with no deadline and stalled the
|
||||
// scheduler).
|
||||
func truncate(s string, maxLen int) string {
|
||||
if len(s) <= maxLen {
|
||||
return s
|
||||
}
|
||||
return s[:maxLen-3] + "..."
|
||||
cut := maxLen - 3
|
||||
if cut < 0 {
|
||||
cut = 0
|
||||
}
|
||||
// Back up to a rune boundary — utf8.RuneStart returns true for any
|
||||
// non-continuation byte (ASCII, or the lead byte of a multi-byte rune).
|
||||
for cut > 0 && !utf8.RuneStart(s[cut]) {
|
||||
cut--
|
||||
}
|
||||
return s[:cut] + "..."
|
||||
}
|
||||
|
||||
// short returns up to n leading characters of s without panicking when s is
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"database/sql"
|
||||
"testing"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
||||
|
||||
@ -599,3 +600,55 @@ func TestRecordSkipped_AdvancesNextRunAt(t *testing.T) {
|
||||
}
|
||||
}
|
||||
// trigger CI
|
||||
|
||||
// ── TestTruncate_utf8Safe_regression2026 ──────────────────────────────────────
|
||||
|
||||
// TestTruncate_utf8Safe_regression2026 locks in the #2026 fix: truncate must
|
||||
// never split a multi-byte UTF-8 rune. Before the fix, a prompt whose byte-197
|
||||
// landed mid-rune (e.g. U+2026 `…` = 0xe2 0x80 0xa6) would be sliced at
|
||||
// maxLen-3 and produce the sequence 0xe2 0x80 0x2e when concatenated with
|
||||
// "...", which Postgres rejects as invalid UTF-8 — wedging the activity_logs
|
||||
// INSERT and stalling the entire scheduler.
|
||||
func TestTruncate_utf8Safe_regression2026(t *testing.T) {
|
||||
// Build a prompt where the 3-byte rune U+2026 (`…`, bytes 0xe2 0x80 0xa6)
// occupies byte indices 195-197. With maxLen=200 the pre-fix code slices at
// s[:197] (maxLen-3), keeping 0xe2 0x80 and dropping the trailing 0xa6.
|
||||
filler := ""
for len(filler) < 195 {
filler += "a"
}
input := filler + "…xxx" // 195 ASCII + 3-byte rune + 3 trailing
out := truncate(input, 200)

if !utf8.ValidString(out) {
t.Fatalf("truncate produced invalid UTF-8: %x", []byte(out))
}
// Must not contain the 0xe2 0x80 0x2e wedge sequence (partial rune
// followed by the "..." suffix).
for i := 0; i < len(out)-2; i++ {
if out[i] == 0xe2 && out[i+1] == 0x80 && out[i+2] == 0x2e {
t.Fatalf("truncate produced the 0xe2 0x80 0x2e wedge sequence at byte %d", i)
}
}
if len(out) > 200 {
t.Fatalf("truncate returned %d bytes, want <= 200", len(out))
}
}

// ── TestSanitizeUTF8 ──────────────────────────────────────────────────────────

// TestSanitizeUTF8 confirms sanitizeUTF8 leaves valid UTF-8 unchanged and
// replaces invalid sequences with the Unicode replacement character.
func TestSanitizeUTF8(t *testing.T) {
// Valid UTF-8 passes through unchanged.
valid := "hello … world"
if got := sanitizeUTF8(valid); got != valid {
t.Errorf("sanitizeUTF8(valid) = %q, want %q", got, valid)
}
// Invalid UTF-8 (orphan continuation byte) is sanitized.
bad := "hello \x80 world"
out := sanitizeUTF8(bad)
if !utf8.ValidString(out) {
t.Errorf("sanitizeUTF8 did not produce valid UTF-8: %x", []byte(out))
}
}
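The sanitizeUTF8 implementation itself is not part of this diff. A minimal sketch consistent with what the test above asserts (an assumption about the shipped code, which may differ in detail) is:

package scheduler // hypothetical package name, for the sketch only

import (
	"strings"
	"unicode/utf8"
)

// sanitizeUTF8Sketch mirrors the contract TestSanitizeUTF8 pins down: valid
// UTF-8 passes through unchanged, and invalid byte sequences are replaced
// with U+FFFD so the activity_logs INSERT never hands Postgres invalid UTF-8.
func sanitizeUTF8Sketch(s string) string {
	if utf8.ValidString(s) {
		return s // fast path: already valid, return unchanged
	}
	return strings.ToValidUTF8(s, "\uFFFD")
}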

@ -143,6 +143,21 @@ func (r *Registry) Names() []string {
return names
}

// Mutators returns a copy of the registered mutators in registration
// order. Used when multiple plugins build their own registries and need
// to merge onto a shared one at boot. Returns a copy so callers can't
// mutate internal state.
func (r *Registry) Mutators() []EnvMutator {
if r == nil {
return nil
}
r.mu.RLock()
defer r.mu.RUnlock()
out := make([]EnvMutator, len(r.mutators))
copy(out, r.mutators)
return out
}
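The merge-at-boot use named in the doc comment can be sketched as follows. This is an illustration only: the Register method on the shared registry is assumed and does not appear in this diff.

// mergeRegistriesSketch copies every plugin registry's mutators onto one
// shared registry at boot. Mutators() returns a copy, so the loop below
// iterates a snapshot and never holds the source registry's lock.
func mergeRegistriesSketch(shared *Registry, plugins ...*Registry) {
	for _, p := range plugins {
		for _, m := range p.Mutators() {
			shared.Register(m) // assumed registration method, not shown in this diff
		}
	}
}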

// FirstTokenProvider returns the first registered mutator that also
// implements TokenProvider, or nil if none do. Used to back the
// GET /admin/github-installation-token endpoint so long-running

@ -247,8 +247,6 @@ class LangGraphA2AExecutor(AgentExecutor):
task_span.set_attribute(A2A_TASK_ID, context.context_id or "")
task_span.set_attribute("a2a.input_preview", user_input[:256])

await set_current_task(self._heartbeat, brief_task(user_input))

# Resolve IDs — the RequestContextBuilder always sets them, but
# we generate fallbacks for safety (e.g. in unit tests).
task_id = context.task_id or str(uuid.uuid4())
@ -257,6 +255,12 @@
updater = TaskUpdater(event_queue, task_id, context_id)

try:
# set_current_task INSIDE the try so active_tasks is always
# decremented by the finally block even if CancelledError hits
# during the heartbeat HTTP push. Moving it outside the try
# created a window where cancellation left active_tasks stuck
# at 1, permanently blocking queue drain. (#2026)
await set_current_task(self._heartbeat, brief_task(user_input))
messages = _extract_history(context)
if messages:
logger.info("A2A execute: injecting %d history messages", len(messages))

@ -426,14 +426,19 @@ class ClaudeSDKExecutor(AgentExecutor):
# Keep a clean copy of the user's actual message for the memory record,
# BEFORE any delegation or memory injection.
original_input = user_input
await set_current_task(self.heartbeat, brief_summary(user_input))
logger.debug("SDK execute [claude-code]: %s", user_input[:200])

prompt = self._prepare_prompt(user_input)
prompt = await self._inject_memories_if_first_turn(prompt)

response_text: str = ""
try:
# set_current_task INSIDE the try so active_tasks is always
# decremented by the finally block even if CancelledError hits
# during the heartbeat HTTP push. Moving it outside the try
# created a narrow window where cancellation left active_tasks
# stuck at 1 forever, permanently blocking queue drain. (#2026)
await set_current_task(self.heartbeat, brief_summary(user_input))
prompt = await self._inject_memories_if_first_turn(prompt)
for attempt in range(_MAX_RETRIES):
options = self._build_options()
try:

@ -280,9 +280,6 @@ class CLIAgentExecutor(AgentExecutor):
# delegation or memory injection happens.
original_input = user_input

# Show current task on canvas — extract a brief one-line summary
await set_current_task(self._heartbeat, brief_summary(user_input))

logger.debug("CLI execute [%s]: %s", self.runtime, user_input[:200])

# Inject delegation results that arrived since last message
@ -290,13 +287,20 @@ class CLIAgentExecutor(AgentExecutor):
if delegation_context:
user_input = f"[Delegation results received while you were idle]\n{delegation_context}\n\n[New message]\n{user_input}"

# Auto-recall: inject prior memories into every prompt. (The CLI
# runtimes don't keep a session, so there's no "first turn" concept.)
memories = await recall_memories()
if memories:
user_input = f"[Prior context from memory]\n{memories}\n\n{user_input}"

try:
# set_current_task INSIDE the try so active_tasks is always
# decremented by the finally block even if CancelledError hits
# during the heartbeat HTTP push. Moving it outside the try
# created a window where cancellation left active_tasks stuck
# at 1, permanently blocking queue drain. (#2026)
await set_current_task(self._heartbeat, brief_summary(user_input))

# Auto-recall: inject prior memories into every prompt. (The CLI
# runtimes don't keep a session, so there's no "first turn" concept.)
memories = await recall_memories()
if memories:
user_input = f"[Prior context from memory]\n{memories}\n\n{user_input}"

await self._run_cli(user_input, event_queue)
finally:
await set_current_task(self._heartbeat, "")
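All three executors above now follow the same contract. A standalone sketch of the pattern, with stubs standing in for the real heartbeat helpers (the stub bodies are assumptions; only the control flow is the point):

# Sketch only, not the executors' real code.
import asyncio

active_tasks = 0

async def set_current_task(heartbeat, summary: str) -> None:
    # Stub: the real helper also pushes the summary to the heartbeat canvas.
    global active_tasks
    active_tasks = 1 if summary else 0

async def execute_sketch(heartbeat, user_input: str) -> None:
    try:
        # Registering the task INSIDE the try means the finally below still
        # clears it even if CancelledError lands during this await (#2026).
        await set_current_task(heartbeat, user_input[:80])
        await asyncio.sleep(0)  # stand-in for the runtime-specific work
    finally:
        await set_current_task(heartbeat, "")  # active_tasks can no longer stick at 1

# asyncio.run(execute_sketch(None, "hello")) leaves active_tasks at 0 either way.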

@ -166,23 +166,42 @@ class SecurityScanConfig:
class ComplianceConfig:
"""OWASP Top 10 for Agentic Applications compliance settings.

Set ``mode: owasp_agentic`` to enable all checks. When ``mode`` is
empty or absent the compliance layer is a complete no-op.
Default is ``mode: owasp_agentic`` + ``prompt_injection: detect``.
The detect mode logs injection attempts as audit events without
blocking the request — so there is no false-positive UX cost, only
a gain in visibility. Operators opt into stricter ``block`` mode per
workspace. To disable compliance entirely (not recommended), set
``mode: ""`` in config.yaml.

Example config.yaml snippet::
Before 2026-04-24, the default was ``mode: ""`` (fully off). A
review of the A2A inbound path showed that no shipped template set
``mode`` explicitly, so prompt-injection detection was silently
disabled for every live workspace despite the machinery existing.
Flipping the default to ``owasp_agentic`` with ``prompt_injection:
detect`` closes that gap with zero user-visible behavior change.

Example config.yaml snippet to opt OUT::

compliance:
mode: owasp_agentic
prompt_injection: block # detect | block (default: detect)
mode: "" # disables all compliance checks

Example config.yaml snippet to tighten::

compliance:
mode: owasp_agentic # (default)
prompt_injection: block # (default: detect)
max_tool_calls_per_task: 30
max_task_duration_seconds: 180
"""

mode: str = ""
"""Enable compliance mode. Set to ``owasp_agentic`` to activate."""
mode: str = "owasp_agentic"
"""Enable compliance mode. ``owasp_agentic`` (default) activates the
OA-01/OA-02/OA-03/OA-06 checks; ``""`` disables everything."""

prompt_injection: str = "detect"
"""``detect`` logs injection attempts; ``block`` raises PromptInjectionError."""
"""``detect`` logs injection attempts (default, zero UX cost);
``block`` raises PromptInjectionError before the agent sees the
text. Operators can tighten to ``block`` per workspace."""

max_tool_calls_per_task: int = 50
"""Maximum number of tool invocations per task before ExcessiveAgencyError."""
@ -353,7 +372,9 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
fail_open_if_no_scanner=security_scan_raw.get("fail_open_if_no_scanner", True),
),
compliance=ComplianceConfig(
mode=compliance_raw.get("mode", ""),
# Default must match ComplianceConfig.mode's dataclass default
# (see class docstring for rationale — 2026-04-24 flip).
mode=compliance_raw.get("mode", "owasp_agentic"),
prompt_injection=compliance_raw.get("prompt_injection", "detect"),
max_tool_calls_per_task=int(compliance_raw.get("max_tool_calls_per_task", 50)),
max_task_duration_seconds=int(compliance_raw.get("max_task_duration_seconds", 300)),
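The net effect of the default flip, for a workspace whose config.yaml omits the compliance section entirely, can be shown as a standalone sketch (a reconstruction of the dataclass fields and load_config fallbacks above, not the shipped module):

# Sketch of the resolved defaults, mirroring ComplianceConfig and load_config above.
from dataclasses import dataclass

@dataclass
class ComplianceConfigSketch:
    mode: str = "owasp_agentic"        # "" turns the compliance layer into a no-op
    prompt_injection: str = "detect"   # "block" raises PromptInjectionError instead of logging
    max_tool_calls_per_task: int = 50
    max_task_duration_seconds: int = 300

compliance_raw: dict = {}  # config.yaml with no compliance section at all
cfg = ComplianceConfigSketch(
    mode=compliance_raw.get("mode", "owasp_agentic"),
    prompt_injection=compliance_raw.get("prompt_injection", "detect"),
    max_tool_calls_per_task=int(compliance_raw.get("max_tool_calls_per_task", 50)),
    max_task_duration_seconds=int(compliance_raw.get("max_task_duration_seconds", 300)),
)
assert cfg.mode == "owasp_agentic" and cfg.prompt_injection == "detect"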