Compare commits

..

2 Commits

Author SHA1 Message Date
Molecule AI Dev Engineer A (Kimi) 48b6011e17 fix(2047): pass workspaceID to stripPluginMarkersFromMemory
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
CI / Python Lint & Test (pull_request) Successful in 6s
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 8s
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 2s
CI / Detect changes (pull_request) Successful in 8s
E2E API Smoke Test / detect-changes (pull_request) Successful in 9s
Harness Replays / detect-changes (pull_request) Successful in 6s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 5s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 11s
gate-check-v3 / gate-check (pull_request_target) Successful in 5s
Lint forbidden tenant-env keys / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 9s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 13s
security-review / approved (pull_request_target) Failing after 5s
qa-review / approved (pull_request_target) Failing after 6s
E2E Chat / detect-changes (pull_request) Successful in 16s
CI / Canvas (Next.js) (pull_request) Successful in 1s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 1s
CI / Canvas Deploy Status (pull_request) Has been skipped
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 2s
E2E Chat / E2E Chat (pull_request) Successful in 2s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 13s
Harness Replays / Harness Replays (pull_request) Successful in 6s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 54s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m32s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 1m49s
CI / Platform (Go) (pull_request) Successful in 3m54s
CI / all-required (pull_request) Successful in 7s
qa-review / approved (pull_request_review) Has been skipped
security-review / approved (pull_request_review) Has been skipped
sop-tier-check / tier-check (pull_request_review) Successful in 5s
sop-checklist / review-refire (pull_request_target) Has been skipped
sop-checklist / all-items-acked (pull_request) [info tier:low] acked: 0/7 — missing: comprehensive-testing, local-postgres-e2e, staging-smoke, +4
sop-checklist / na-declarations (pull_request) N/A: (none)
sop-tier-check / tier-check (pull_request_target) Successful in 5s
sop-checklist / all-items-acked (pull_request_target) Successful in 8s
2026-06-05 04:09:20 +00:00
Molecule AI Dev Engineer A (Kimi) cc99d3fff4 fix(plugins): log silently ignored execAsRoot errors during uninstall
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 1s
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 3s
CI / Python Lint & Test (pull_request) Successful in 3s
CI / Detect changes (pull_request) Successful in 6s
E2E API Smoke Test / detect-changes (pull_request) Successful in 5s
E2E Chat / detect-changes (pull_request) Successful in 5s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 5s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 6s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 3s
Lint forbidden tenant-env keys / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 2s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 4s
gate-check-v3 / gate-check (pull_request_target) Successful in 3s
Harness Replays / detect-changes (pull_request) Successful in 17s
qa-review / approved (pull_request_target) Failing after 4s
sop-checklist / review-refire (pull_request_target) Has been skipped
sop-checklist / all-items-acked (pull_request) acked: 0/7 — missing: comprehensive-testing, local-postgres-e2e, staging-smoke, +4
sop-checklist / na-declarations (pull_request) N/A: (none)
sop-checklist / all-items-acked (pull_request_target) Successful in 3s
security-review / approved (pull_request_target) Failing after 8s
sop-tier-check / tier-check (pull_request_target) Successful in 4s
CI / Canvas (Next.js) (pull_request) Successful in 1s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 2s
E2E Chat / E2E Chat (pull_request) Successful in 2s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 2s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 58s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Failing after 25s
CI / Platform (Go) (pull_request) Failing after 36s
CI / all-required (pull_request) Has been skipped
Harness Replays / Harness Replays (pull_request) Successful in 2s
CI / Canvas Deploy Status (pull_request) Has been skipped
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Failing after 48s
Plugin uninstall had two sites where execAsRoot errors were discarded:
- Skill directory removal (plugins_install.go:125) — orphaned skill dirs
  if rm -rf failed silently
- CLAUDE.md marker stripping (plugins_install_pipeline.go:326) — stale
  plugin content left in CLAUDE.md if awk script failed

Both now log the error without failing the overall uninstall (best-effort
 cleanup), giving operators visibility into incomplete uninstalls.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-05 03:52:07 +00:00
63 changed files with 646 additions and 4591 deletions
+5 -14
View File
@@ -19,22 +19,13 @@ REDIS_URL=redis://localhost:6379
# itself to 3000 in canvas/package.json, so sourcing this file before
# `npm run dev` won't accidentally make Next.js try to bind 8080.
PORT=8080
# ---- Admin credential — REQUIRED in EVERY environment (auth is fail-closed) ----
# Auth is fail-CLOSED everywhere now (harden/no-fail-open-auth): there is NO
# dev-mode escape hatch. AdminAuth / WorkspaceAuth / discovery all require a
# real credential. The canvas authenticates by sending this value as a bearer
# (it reads NEXT_PUBLIC_ADMIN_TOKEN — set it to the SAME value).
# ---- Admin credential — REQUIRED to close issue #684 (AdminAuth bearer bypass) ----
# When ADMIN_TOKEN is set, only this value is accepted on /admin/* and /approvals/* routes.
# (When unset, a fresh install 401s on admin routes and any valid workspace bearer
# is the only deprecated fallback once tokens exist — set ADMIN_TOKEN to close #684.)
# Generate: openssl rand -base64 32 (scripts/dev-start.sh provisions a fixed dev value)
# Without it, any valid workspace bearer token can call admin endpoints (backward compat
# fallback, still vulnerable). Set this in every environment, rotate when compromised.
# Generate: openssl rand -base64 32
# Store in fly secrets / deployment env — NEVER commit the actual value here.
ADMIN_TOKEN=
# NEXT_PUBLIC_ADMIN_TOKEN= # Canvas-side mirror of ADMIN_TOKEN. The canvas
# bakes this into its bundle and sends it as the
# bearer. MUST equal ADMIN_TOKEN (next.config.ts
# warns if the pair is half-set). dev-start.sh
# exports it for you.
SECRETS_ENCRYPTION_KEY= # 32-byte key (raw or base64). Leave empty for plaintext (dev only).
CONFIGS_DIR= # Path to workspace-configs-templates/ (auto-discovered if empty)
PLUGINS_DIR= # Path to plugins/ directory (default: /plugins in container)
@@ -43,7 +34,7 @@ PLUGINS_DIR= # Path to plugins/ directory (default: /plugins i
# MOLECULE_MCP_ALLOW_SEND_MESSAGE= # Set to "true" to include send_message_to_user in the MCP bridge tool list (issue #810). Excluded by default to prevent unintended WebSocket pushes from CLI sessions.
# MOLECULE_MCP_URL=http://localhost:8080 # Platform URL for opencode MCP config (opencode.json). Same as PLATFORM_URL; separate var so opencode configs can reference it without ambiguity.
# WORKSPACE_DIR= # Optional global host path bind-mounted to /workspace in every container. Per-workspace workspace_dir column overrides this; if neither is set each workspace gets an isolated Docker named volume.
MOLECULE_ENV=development # Environment label (development/staging/production). Used for log tagging and for NON-security local-dev conveniences (loopback HTTP bind, relaxed rate-limit bucket). It is NOT an auth lever — auth is fail-closed in every environment. SaaS deployments MUST set MOLECULE_ENV=production.
MOLECULE_ENV=development # Environment label (development/staging/production). Used for log tagging and for the AdminAuth dev-mode escape hatch (lets the Canvas dashboard keep working after the first workspace is created, when ADMIN_TOKEN is unset). SaaS deployments MUST set MOLECULE_ENV=production.
# MOLECULE_ENABLE_TEST_TOKENS= # Set to 1 to expose GET /admin/workspaces/:id/test-token (mints a fresh bearer token for E2E scripts). The route is auto-enabled when MOLECULE_ENV != production; this flag is the explicit override. Leave unset/0 in prod — the route 404s unless enabled.
# MOLECULE_ORG_ID= # SaaS only: org UUID set by control plane on tenant machines. When set, workspace provisioning auto-routes through the control plane API instead of Docker.
# CP_PROVISION_URL= # Override control plane URL for workspace provisioning (default: https://api.moleculesai.app). Only needed for testing against a non-production control plane.
-19
View File
@@ -364,25 +364,6 @@ jobs:
# check missed. If a refactor weakens the gate to a shape check,
# this step goes red on every PR.
bash tests/e2e/test_completion_assert_unit.sh
# harden/e2e-staging-saas-failclosed: fail-direction proof for the
# E2E_REQUIRE_LIVE fail-closed-on-skip guard in
# test_staging_full_saas.sh. Offline (no LLM/network/provisioning):
# asserts the guard exits 5 when a live lifecycle did NOT run and
# passes when all milestones fired — so a refactor that lets the
# staging gate report green without a real provision→online→A2A
# cycle goes red on every PR.
bash tests/e2e/test_require_live_guard_unit.sh
# harden/enforce-ci-gates-core-v2 (PR #2286): fail-direction proof
# for the E2E_REQUIRE_LIVE zero-validated gate in
# test_priority_runtimes_e2e.sh (the REQUIRED `E2E API Smoke Test`).
# Offline (no LLM/network/provisioning): sources that script under
# its unit source-guard and drives the REAL evaluate_require_live_gate
# — asserts REQUIRE_LIVE=1 + zero validated → RED (the false-green
# trap), REQUIRE_LIVE=1 + >=1 validated → GREEN, and REQUIRE_LIVE
# unset + zero validated → GREEN (loud skip). CI can't provision a
# live arm to prove this, so this unit test IS the regression gate:
# a revert of the zero-validated→RED logic goes red on every PR.
bash tests/e2e/test_require_live_priority_gate_unit.sh
- if: ${{ needs.changes.outputs.scripts == 'true' }}
name: Test ECR promote-tenant-image script (mock-driven, no live infra)
+1 -59
View File
@@ -272,24 +272,6 @@ jobs:
echo "::error::Redis did not become ready in 15s"
docker logs "$REDIS_CONTAINER" || true
exit 1
- name: Set deterministic admin token for the e2e platform
if: needs.detect-changes.outputs.api == 'true'
run: |
# AdminAuth (workspace-server/internal/middleware/wsauth_middleware.go:164)
# reads ADMIN_TOKEN. Setting it (a) closes isDevModeFailOpen (devmode.go:50
# returns false when ADMIN_TOKEN is non-empty), so admin routes require a
# bearer, and (b) makes Tier-2b accept a bearer that constant-time-equals
# ADMIN_TOKEN. The platform process inherits ADMIN_TOKEN from $GITHUB_ENV.
#
# MOLECULE_ADMIN_TOKEN is the var the e2e scripts send as the bearer
# (tests/e2e/_lib.sh:33 e2e_mint_workspace_token, and the run_mock
# org-import curl). Set BOTH to the SAME value so the bearer the test
# sends == the secret the platform checks. Deterministic test value;
# this platform is ephemeral, single-run, and never reachable off-host.
E2E_ADMIN_TOKEN="e2e-api-admin-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "MOLECULE_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "Admin token configured for the e2e platform (ADMIN_TOKEN + MOLECULE_ADMIN_TOKEN)."
- name: Build platform
if: needs.detect-changes.outputs.api == 'true'
working-directory: workspace-server
@@ -412,51 +394,11 @@ jobs:
- name: Run E2E API tests
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_api.sh
- name: Run keyless feature-contract E2E (terminal-diagnose / webhooks / budget / checkpoints / audit / traces / session-search / rescue / llm-billing-mode / resume / hibernate)
# Keyless required-lane coverage for feature endpoints that ship without
# an LLM key (runtime=external fixture). Each asserts the real HTTP
# contract + a meaningful failure mode (401/400/fail-closed) so a
# regression goes RED, not silently green. The mock-runtime A2A canned
# round-trip is covered by the priority-runtimes `mock` arm, not here.
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_keyless_feature_contracts_e2e.sh
- name: Run secrets-dispatch contract test (keyless SECRETS_JSON branch order)
# Previously orphaned (no workflow referenced it). Hermetic unit-style
# contract over test_staging_full_saas.sh's LLM-key branch precedence —
# needs no platform, no bearer, no network. Guards the 2026-05-03
# "wrong key shape wins" incident class.
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_secrets_dispatch.sh
- name: Run notify-with-attachments E2E
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_notify_attachments_e2e.sh
- name: "Run priority-runtimes E2E (REQUIRE-LIVE: mock validates the runtime plumbing end-to-end)"
# E2E_REQUIRE_LIVE=1 is ON: the run MUST validate >=1 runtime end-to-end
# or it exits NON-zero (RED). This is now SAFE because the `mock` arm can
# actually provision in CI: the only blocker was that POST /org/import and
# POST /admin/workspaces/:id/tokens are AdminAuth-gated
# (router.go:778 + :427) and this job previously configured NO admin token,
# so every admin call 401'd ("admin auth required"). The "Set deterministic
# admin token" step above now sets ADMIN_TOKEN on the platform AND exports
# the matching MOLECULE_ADMIN_TOKEN the e2e scripts send as the bearer, so
# the mock arm can org-import → online → mint token → canned A2A reply →
# validated(). That guarantees VALIDATED>=1 on a healthy platform, so the
# REQUIRED `E2E API Smoke Test` gate now HONESTLY validates a runtime
# end-to-end; if the mock plumbing (DB insert, status flip, A2A proxy,
# activity logging, or the admin-auth wiring) genuinely breaks, the gate
# goes RED instead of false-green. The zero-validated→RED decision is also
# regression-gated WITHOUT provisioning by the bash unit test
# tests/e2e/test_require_live_priority_gate_unit.sh (wired into ci.yml's
# "Run E2E bash unit tests" job), so a revert of that logic still fails CI.
#
# MiniMax stays an OPPORTUNISTIC best-effort arm: create is registry-fragile
# in CI (422 UNREGISTERED_MODEL_FOR_RUNTIME), so a miss is reported via
# bestfail() and never reds the gate — mock carries the required validation,
# MiniMax is a bonus real-LLM check when it comes up. ZERO new credentials.
- name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
if: needs.detect-changes.outputs.api == 'true'
env:
E2E_REQUIRE_LIVE: '1'
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
run: bash tests/e2e/test_priority_runtimes_e2e.sh
- name: Install standalone runtime parser from Gitea registry
if: needs.detect-changes.outputs.api == 'true'
+1 -58
View File
@@ -113,28 +113,6 @@ jobs:
runs-on: docker-host
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
#
# PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
# without CTO sign-off, that's the irreversible call):
# NOW FAIL-CLOSED:
# - Postgres/Redis/platform/canvas readiness are already bounded
# readiness-polls that hard-fail (and dump logs) at their deadline,
# not fixed sleeps — preserved.
# - passWithNoTests:false + forbidOnly (playwright.config.ts) → a
# renamed/moved spec or stray test.only can no longer green the lane.
# - REQUIRE-LIVE guard in "Run Playwright E2E tests" → chat==true must
# actually execute >=1 test, else exit 1.
# - chat-desktop "activity log" test no longer swallows its assertion.
# STILL BLOCKS PROMOTION:
# - The echo round-trip asserts on rendered "Echo: ..." text but never
# asserts the echo runtime actually RECEIVED the A2A request
# (fixtures/echo-runtime.ts exposes lastRequest, unused) — an
# optimistic client-side render could pass without a real round-trip.
# Add a server-received assertion before required.
# - The "No-op pass" path (detect-changes chat!=true) is a legitimate
# paths-filter skip, but a required gate needs it to be a neutral
# check, not a green "success", so a skipped heavy lane can't be
# mistaken for a passed one.
continue-on-error: true
timeout-minutes: 15
env:
@@ -249,20 +227,6 @@ jobs:
echo "CANVAS_PORT=${CANVAS_PORT}" >> "$GITHUB_ENV"
echo "Canvas host port: ${CANVAS_PORT}"
- name: Set deterministic admin token
if: needs.detect-changes.outputs.chat == 'true'
run: |
# PR #2291 made auth fail-closed everywhere (no dev-mode escape).
# The platform server requires ADMIN_TOKEN; the canvas requires the
# matching NEXT_PUBLIC_ADMIN_TOKEN or every API call 401s.
# We set a deterministic per-run value so the ephemeral platform
# and canvas are paired correctly.
E2E_ADMIN_TOKEN="e2e-chat-admin-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "MOLECULE_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "NEXT_PUBLIC_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "Admin token configured for e2e-chat platform + canvas."
- name: Start platform (background)
if: needs.detect-changes.outputs.chat == 'true'
working-directory: workspace-server
@@ -370,32 +334,11 @@ jobs:
- name: Run Playwright E2E tests
if: needs.detect-changes.outputs.chat == 'true'
working-directory: canvas
env:
# CI=1 activates forbidOnly in playwright.config.ts (a stray
# `test.only` would otherwise green the suite while skipping the
# rest). passWithNoTests:false (also in the config) already makes
# a zero-match selection exit non-zero.
CI: "1"
run: |
set -euo pipefail
export E2E_PLATFORM_URL="http://127.0.0.1:${PLATFORM_PORT}"
export E2E_DATABASE_URL="${DATABASE_URL}"
export PLAYWRIGHT_BASE_URL="http://localhost:${CANVAS_PORT}"
# REQUIRE-LIVE guard (mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE):
# this lane reached here only because detect-changes said chat==true,
# so it MUST actually execute the round-trip specs. `pipefail` makes
# a real test failure (playwright non-zero) abort here under `set -e`;
# passWithNoTests:false makes a zero-match selection non-zero too. The
# explicit grep below is belt-and-braces: assert the list reporter
# printed an executed-count summary, so a silent all-skip / no-op can
# never report green.
npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts \
--reporter=list 2>&1 | tee /tmp/pw-chat.out
if ! grep -qE '[0-9]+ (passed|failed|skipped)' /tmp/pw-chat.out; then
echo "::error::E2E Chat REQUIRE-LIVE: chat==true but Playwright reported no executed tests — specs missing or all-skipped, refusing to report green."
exit 1
fi
npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts
- name: Dump platform log on failure
if: failure() && needs.detect-changes.outputs.chat == 'true'
+3 -24
View File
@@ -12,30 +12,9 @@ name: E2E Staging Canvas (Playwright)
#
# Playwright test suite that provisions a fresh staging org per run and
# verifies every workspace-panel tab renders REAL content (not just an
# empty/errored container). Complements e2e-staging-saas.yml (which tests
# the API shape) by exercising the actual browser + canvas bundle against
# live staging.
#
# PROMOTION-READINESS (toward making this a HARD merge-gate):
# NOW RELIABLE (spec hardened — staging-tabs.spec.ts):
# - All waits condition-based (toBeVisible/toHaveAttribute/expect.poll);
# no fixed waitForTimeout in the spec.
# - Tabs asserted on settled REAL content, not "container visible".
# - ErrorBoundary + visible error alerts fail non-degraded tabs.
# - Tab-list parity-checked vs live DOM; fail-closed on missing tenant.
# STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT remove continue-on-error —
# CTO-owned, RFC internal#219 §1):
# - Infra dependency: real staging EC2 per run (12-20 min cold boot);
# AWS/Cloudflare/CP availability would become merge-blockers.
# - Shared-zone TLS/DNS/ACME propagation flake surface is upstream of
# this repo and outside its control.
# - Required-gate correctness needs CP_STAGING_ADMIN_API_TOKEN GUARANTEED
# present; today's skip-if-absent (core#2225) is right for non-gating
# but would skip-green a required check.
# - Single hermes/platform_managed workspace; agent-dependent content
# (live chat/traces round-trip) not exercised on staging (#2162).
# The full checklist lives at the foot of canvas/e2e/staging-tabs.spec.ts.
# verifies every workspace-panel tab renders without crashing. Complements
# e2e-staging-saas.yml (which tests the API shape) by exercising the
# actual browser + canvas bundle against live staging.
#
# Triggers: push to main, PR touching canvas sources + this workflow only
# after the PR enters `merge-queue`, manual dispatch, and scheduled cron to
-28
View File
@@ -85,25 +85,6 @@ jobs:
runs-on: ubuntu-latest
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
#
# PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
# without CTO sign-off, that's the irreversible call):
# NOW FAIL-CLOSED:
# - Missing CP_STAGING_ADMIN_API_TOKEN → hard exit 2 (preflight).
# - Staging CP unhealthy → hard exit 1 (preflight, not a workspace bug).
# - Harness E2E_REQUIRE_LIVE=1 → exit 5 if a clean exit didn't prove
# all four awaiting_agent transitions (no silent skip).
# - Sweep transition (step 6) is now a bounded readiness-poll, not a
# fixed sleep + one-shot assert → no more sweep-cadence flake.
# - register / re-register retry ONLY transient edge 5xx (bounded),
# fail closed on 4xx → no more cold-boot-502 flake.
# STILL BLOCKS PROMOTION:
# - Single shared staging tenant + EC2 quota window: an infra-side
# provisioning outage (not a code bug) would turn the gate red.
# Needs an infra-class vs code-class signal split before required.
# - "CP unhealthy → exit 1" currently looks identical to a real
# failure on the run page; required-gate would need it demoted to
# a neutral/skip so staging flakiness can't block merges.
continue-on-error: true
timeout-minutes: 25
@@ -143,15 +124,6 @@ jobs:
- name: Run external-runtime E2E
id: e2e
# E2E_REQUIRE_LIVE=1: the harness fails CLOSED (exit 5) if it ever
# reaches a clean exit without proving all four awaiting_agent
# transitions. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE — a
# silent skip / early-return / dropped assertion can no longer
# masquerade as green. Token-missing and CP-unhealthy already
# hard-fail in the two preflight steps above, so reaching this step
# means a real cycle is expected.
env:
E2E_REQUIRE_LIVE: "1"
run: bash tests/e2e/test_staging_external_runtime.sh
# Mirror the e2e-staging-saas.yml safety net: if the runner is
@@ -109,9 +109,6 @@ jobs:
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
E2E_RUNTIME: claude-code
# Platform-managed create path (moonshot/kimi-k2.6, no tenant key) — the
# combo proven to create cleanly; this test only needs the ws online.
E2E_LLM_PATH: platform
E2E_MODEL_SLUG: MiniMax-M2
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
+1 -19
View File
@@ -172,23 +172,9 @@ jobs:
# and defeats the cost saving. Operators can override via the
# workflow_dispatch flow (no input wired here yet — runtime
# override is enough for ad-hoc).
#
# #2263 deploy-skew: the claude-code default is the COLON-namespaced BYOK
# id `minimax:MiniMax-M2.7`, NOT bare `MiniMax-M2`. The deployed staging
# ws-server's compiled registry can lag source; validateRegisteredModelForRuntime
# 400s the bare form on an older image (the sibling Platform Boot job, on
# the SAME image, succeeds with namespaced `moonshot/kimi-k2.6`). The colon
# form stays in the BYOK `minimax` arm (providers.yaml:851) so it resolves
# provider=minimax (BYOK) and the #1994 byok-not-platform guard still
# passes — the slash/platform form `minimax/MiniMax-M2.7` would not.
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'minimax:MiniMax-M2.7' }}
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2' }}
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
# Fail-closed-on-skip: in CI the harness MUST prove ≥1 full
# provision→online→A2A cycle. If it reaches the end having validated
# nothing (a future short-circuit / skip path), it exits 5 rather than
# reporting a false green. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE.
E2E_REQUIRE_LIVE: '1'
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -386,10 +372,6 @@ jobs:
E2E_MODE: smoke
E2E_RUN_ID: "platform-${{ github.run_id }}-${{ github.run_attempt }}"
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
# Fail-closed-on-skip (see BYOK job). smoke mode still runs steps 2/4/7/8b,
# so all four required milestones (provisioned/tenant_online/
# workspace_online/a2a_roundtrip) fire — the guard is valid for this lane too.
E2E_REQUIRE_LIVE: '1'
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+18 -32
View File
@@ -33,20 +33,11 @@
# 2026-05-17 (internal#189 Phase 1).
#
# BURN-IN CLOSED 2026-05-17 (internal#189 Phase 1): The 7-day burn-in
# window closed. As of 2026-06-04 the residual masks left behind by the
# burn-in are removed for real (the comment previously claimed this while
# the masks still persisted — that was stale):
# - continue-on-error: true on the jq-install step (redundant; the step
# already exits 0) and on the tier-check step (the burn-in mask).
# - the `|| true` after the sop-tier-check.sh invocation, which masked
# real tier-gate verdicts.
# AND-composition is now fully enforced and the tier-check step can
# honestly red CI on a real SOP-6 violation. SOP_FAIL_OPEN=1 is RETAINED
# as sanctioned infra-resilience: it fails-open only on token/network/jq
# faults, never on a real gate verdict. If you need to temporarily
# re-introduce a mask, file a tracker and follow the mc#1982 protocol
# (Tier 2e lint requires a current tracker within 2 lines of any
# continue-on-error: true).
# window closed. continue-on-error: true has been removed from the
# tier-check job; AND-composition is now fully enforced. If you need
# to temporarily re-introduce a mask, file a tracker and follow the
# mc#1982 protocol (Tier 2e lint requires a current tracker within
# 2 lines of any continue-on-error: true).
name: sop-tier-check
@@ -99,11 +90,10 @@ jobs:
# GitHub releases may be unreachable from some runner networks
# (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
# runners). The sop-tier-check script has its own fallback as a
# third line of defense, and this step's final command
# (`jq --version ... || echo`) already exits 0 unconditionally — so
# the step cannot fail the job on its own.
# continue-on-error REMOVED 2026-06-04 (mc#1982 directive: root-fix
# and remove, do not renew). It was redundant masking, not a gate.
# third line of defense. continue-on-error: true ensures this step
# failing does not block the job.
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
run: |
# apt-get is the primary method — Ubuntu package mirrors are reliably
# reachable from runner containers. GitHub releases may be blocked
@@ -120,11 +110,11 @@ jobs:
jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"
- name: Verify tier label + reviewer team membership
# continue-on-error REMOVED 2026-06-04 (expired internal#189 Phase 1
# burn-in, window closed 2026-05-17; mc#1982 directive: root-fix and
# remove, do not renew). SOP_FAIL_OPEN=1 below still fails-open on
# token/network/infra errors only (never on a real tier-gate verdict),
# so this step can now honestly fail CI on a genuine SOP-6 violation.
# continue-on-error: true at step level — job-level is ignored by Gitea
# Actions (quirk #10, internal runbooks). Belt-and-suspenders with
# SOP_FAIL_OPEN=1 + || true below.
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
continue-on-error: true
env:
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
GITEA_HOST: git.moleculesai.app
@@ -133,13 +123,9 @@ jobs:
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
SOP_DEBUG: '0'
SOP_LEGACY_CHECK: '0'
# SOP_FAIL_OPEN=1 fails-open ONLY on infra faults (empty/invalid
# token, unreachable Gitea API, missing jq) — see the guarded
# `exit 0` branches in sop-tier-check.sh. It does NOT mask a real
# tier-gate verdict: a missing tier label, no approving review, or
# an unsatisfied AND-clause still `exit 1`. Kept as sanctioned
# infra-resilience; the `|| true` mask was REMOVED with the burn-in
# COE (2026-06-04) so a genuine SOP-6 violation now reds CI.
# SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
# the actual merge gate. Combined with continue-on-error: true
# above, this step never fails the job regardless of script exit.
SOP_FAIL_OPEN: '1'
run: |
bash .gitea/scripts/sop-tier-check.sh
bash .gitea/scripts/sop-tier-check.sh || true
+4 -13
View File
@@ -101,19 +101,10 @@ test.describe("Desktop ChatTab", () => {
await textarea.fill("Trigger activity");
await page.getByRole("button", { name: /Send/ }).first().click();
// FALSE-GREEN FIX: the prior `.catch(() => {})` swallowed the assertion
// entirely, so this test passed whether or not the activity log ever
// rendered. The activity-log container is optional per layout, so we
// gate on its presence in the DOM: if it's not part of this layout,
// skip explicitly (a recorded skip, not a silent pass); if it IS
// present, it MUST become visible during the send flow — that's the
// behaviour this test exists to protect.
const activityLog = page.locator("[data-testid='activity-log']").first();
if ((await activityLog.count()) === 0) {
test.skip(true, "activity-log not part of this layout");
return;
}
await expect(activityLog).toBeVisible({ timeout: 10_000 });
// Activity log container should appear during the send flow.
await expect(page.locator("[data-testid='activity-log']").first()).toBeVisible({ timeout: 10_000 }).catch(() => {
// Activity log may not be present in all layouts.
});
});
});
-329
View File
@@ -1,329 +0,0 @@
/**
* Staging canvas E2E — REAL desktop take-control path (core#2261 "Gap 1").
*
* This is the live-e2e gate that the existing staging-tabs.spec.ts does NOT
* provide. staging-tabs only opens the 13 declared workspace-panel tabs
* (TAB_IDS at staging-tabs.spec.ts:24-38 — `display` is NOT among them) and
* asserts they render without a "Failed to load" toast. It never acquires
* display control, never opens the noVNC WebSocket, and never asserts a
* framebuffer frame arrives. The companion unit test
* canvas/src/components/tabs/__tests__/DisplayTab.test.tsx mocks the RFB
* constructor (vi.mock("@novnc/novnc"), see its lines 8/20-39) so NO real
* WebSocket is ever opened there either. Result: a broken take-control path
* (acquire → noVNC WS upgrade → ws-proxy → EIC → websockify → x11vnc → Xvfb)
* ships GREEN. This spec closes that gap by exercising the REAL wire path
* end to end against a live, desktop-capable staging workspace.
*
* What it asserts (the real path, no mocks):
* 1. POST /workspaces/<id>/display/control/acquire returns 200 with a
* session_url that carries the signed token in its `#token=` fragment
* (mirrors workspace_display_control.go:signedDisplaySessionURL).
* 2. Opening the noVNC WebSocket at session_url with the subprotocols
* ["binary", "molecule-display-token.<token>"] (exactly what the canvas
* sends — DisplayTab.tsx:339) UPGRADES (onopen fires, readyState===OPEN,
* no immediate 1006 abnormal close). A 1006 / 403 means the handshake
* failed somewhere in the proxy chain.
* 3. At least one BINARY framebuffer message arrives on that socket — a
* real frame off x11vnc, not just a panel mount. RFB sends a
* ProtocolVersion banner ("RFB 003.00x\n") as the first server message,
* which proves the upstream VNC server is live behind the EIC tunnel.
*
* Auth model (important): the WS upgrade is gated by workspace-server
* middleware.AdminAuth. A browser WebSocket CANNOT set an Authorization
* header, so in production the canvas WS upgrade passes AdminAuth via the
* same-origin-canvas path (wsauth_middleware.go:isSameOriginCanvas, which
* keys off the Origin header the browser sets automatically on a same-origin
* WS upgrade). We therefore open the socket from inside the browser page via
* page.evaluate AFTER navigating to the tenant origin — so the browser sends
* `Origin: https://<slug>.staging.moleculesai.app`, exactly as production
* does. The acquire POST (which CAN carry a header) uses the per-tenant admin
* bearer set on the context. This is the faithful production handshake, not a
* synthetic one.
*
* Gate / cost: this test only runs when STAGING_DISPLAY_WORKSPACE_ID points
* at a STANDING desktop-capable workspace (compute.display.mode ==
* "desktop-control"). We deliberately do NOT provision one in the shared
* staging-setup.ts: a desktop AMI boots in ~12-15 min and would tax the
* existing tabs harness on every run. Standing that workspace up is a cost
* item for the CTO (one always-on desktop EC2 on staging). Until that exists,
* the test SKIPS loud. When the env IS present, any failure in
* provision/acquire/upgrade is a HARD error — fail-closed, never silently
* green (no "flaky" disposition: a 1006 names a broken proxy hop).
*/
import { test, expect } from "@playwright/test";
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
// The standing desktop-capable workspace id. Absent => skip loud. This is
// the single knob that activates the gate; see file header for the cost note.
const DISPLAY_WS_ID = process.env.STAGING_DISPLAY_WORKSPACE_ID;
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
test.skip(
!DISPLAY_WS_ID,
"STAGING_DISPLAY_WORKSPACE_ID not set — no standing desktop-capable staging " +
"workspace to exercise the take-control path. Set it to a workspace whose " +
"compute.display.mode == 'desktop-control' to activate this real-e2e gate. " +
"(Standing that workspace up is a CTO cost item — one always-on desktop EC2.)",
);
// How long we wait for the WS to upgrade + deliver the first frame. The EIC
// tunnel + websockify handshake adds real latency on top of the edge; budget
// generously but bounded, so a genuinely-dead path fails LOUD instead of
// hanging to the suite timeout.
const WS_UPGRADE_TIMEOUT_MS = 30_000;
const FIRST_FRAME_TIMEOUT_MS = 30_000;
test.describe("staging desktop take-control (real noVNC path)", () => {
test("acquire → WS upgrades → first framebuffer frame arrives", async ({
page,
context,
}) => {
// The standing desktop workspace lives in its OWN standing org (it can't
// live in the per-run ephemeral org — that gets torn down each run). When
// STAGING_DISPLAY_SLUG is configured, staging-setup.ts resolves that org's
// tenant URL / admin token / org id and exports them under STAGING_DISPLAY_*.
// Fall back to the ephemeral org's exports only if the display org wasn't
// separately configured (e.g. the desktop workspace happens to live in the
// run's own tenant — not the expected topology, but supported).
const tenantURL =
process.env.STAGING_DISPLAY_TENANT_URL || process.env.STAGING_TENANT_URL;
const tenantToken =
process.env.STAGING_DISPLAY_TENANT_TOKEN || process.env.STAGING_TENANT_TOKEN;
const orgID =
process.env.STAGING_DISPLAY_ORG_ID || process.env.STAGING_ORG_ID;
// Fail-closed: when the gate env IS present (we got past the skips above),
// the rest of the staging context MUST be wired or this is a hard error,
// never a silent pass. Mirrors staging-tabs.spec.ts:53-57.
if (!tenantURL || !tenantToken) {
throw new Error(
"STAGING_DISPLAY_WORKSPACE_ID is set but no tenant URL/token is available " +
"for the take-control gate. Set STAGING_DISPLAY_SLUG so staging-setup.ts " +
"resolves STAGING_DISPLAY_TENANT_URL / STAGING_DISPLAY_TENANT_TOKEN for the " +
"standing desktop org (or ensure the ephemeral STAGING_TENANT_* exports exist).",
);
}
const workspaceId = DISPLAY_WS_ID as string;
// The per-tenant admin bearer satisfies AdminAuth for the acquire POST
// (which can carry a header). The WS upgrade below relies on Origin
// (same-origin canvas), NOT this header.
await context.setExtraHTTPHeaders({
Authorization: `Bearer ${tenantToken}`,
// X-Molecule-Org-Id is required by workspace-server TenantGuard for
// cross-org requests routed through the CP edge; staging-setup exports it.
// Harmless (and correct) to send on the same-origin tenant box too.
...(orgID ? { "X-Molecule-Org-Id": orgID } : {}),
});
// 0. Sanity: the workspace must actually be display-enabled, else the
// whole gate is meaningless. Hit the availability endpoint first so a
// mis-pointed STAGING_DISPLAY_WORKSPACE_ID fails with a precise message
// instead of an opaque acquire error.
const availResp = await page.request.get(
`${tenantURL}/workspaces/${workspaceId}/display`,
);
expect(
availResp.status(),
`GET /display for ${workspaceId} should be 200`,
).toBe(200);
const avail = await availResp.json();
expect(
avail.available,
`workspace ${workspaceId} is not display-available (reason=${avail.reason}). ` +
"STAGING_DISPLAY_WORKSPACE_ID must point at a workspace with " +
"compute.display.mode == 'desktop-control' AND a live instance_id.",
).toBe(true);
// 1. Acquire display control. The handler returns session_url +
// expires_at; session_url embeds the signed token in its #token=
// fragment (workspace_display_control.go:signedDisplaySessionURL).
const acquireResp = await page.request.post(
`${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
{ data: { controller: "user", ttl_seconds: 300 } },
);
expect(
acquireResp.status(),
`acquire should be 200; body: ${await acquireResp.text()}`,
).toBe(200);
const acquire = await acquireResp.json();
expect(acquire.controller, "controller should be 'user'").toBe("user");
expect(
typeof acquire.session_url,
`acquire response missing session_url: ${JSON.stringify(acquire)}`,
).toBe("string");
// The token rides in the URL fragment (#token=...), never as a query
// param — confirm the contract the client (DisplayTab.tsx:459-466)
// depends on so a server-side change to the URL shape fails HERE.
const sessionUrl: string = acquire.session_url;
expect(
sessionUrl,
`session_url should carry the token in a #token= fragment: ${sessionUrl}`,
).toContain("#token=");
// 2. Open the REAL noVNC WebSocket from inside the page, so the browser
// sends Origin: <tenant> and the same-origin-canvas AdminAuth path
// accepts the upgrade (a browser WS can't set Authorization). We
// navigate to the tenant origin first purely to anchor the Origin
// header; we don't need the canvas bundle to hydrate.
await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
// Reproduce DisplayTab.tsx:459-466 (displayWebSocketConnection): resolve
// session_url against the tenant origin, pull the token out of the
// fragment, strip the fragment, switch http(s)->ws(s). Then connect with
// the exact subprotocols the canvas uses (DisplayTab.tsx:339).
const result = await page.evaluate(
async ({ rawSessionUrl, upgradeTimeoutMs, frameTimeoutMs }) => {
const u = new URL(rawSessionUrl, window.location.href);
const token =
new URLSearchParams(u.hash.replace(/^#/, "")).get("token") ?? "";
if (!token) {
return { ok: false, stage: "token-parse", detail: "no #token in session_url" };
}
u.hash = "";
u.protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
const wsUrl = u.toString();
return await new Promise<{
ok: boolean;
stage: string;
detail: string;
frameBytes?: number;
frameKind?: string;
closeCode?: number;
}>((resolve) => {
let upgraded = false;
let settled = false;
const finish = (r: {
ok: boolean;
stage: string;
detail: string;
frameBytes?: number;
frameKind?: string;
closeCode?: number;
}) => {
if (settled) return;
settled = true;
try {
ws.close();
} catch {
/* ignore */
}
resolve(r);
};
let ws: WebSocket;
try {
ws = new WebSocket(wsUrl, [`binary`, `molecule-display-token.${token}`]);
} catch (e) {
resolve({ ok: false, stage: "construct", detail: String(e) });
return;
}
ws.binaryType = "arraybuffer";
const upgradeTimer = setTimeout(() => {
finish({
ok: false,
stage: "upgrade-timeout",
detail: `WS did not open within ${upgradeTimeoutMs}ms (readyState=${ws.readyState})`,
});
}, upgradeTimeoutMs);
let frameTimer: ReturnType<typeof setTimeout> | null = null;
ws.onopen = () => {
upgraded = true;
clearTimeout(upgradeTimer);
// Now wait for the first server message. RFB's ProtocolVersion
// banner is the first thing x11vnc sends; if nothing arrives the
// tunnel opened but the VNC server behind it is dead.
frameTimer = setTimeout(() => {
finish({
ok: false,
stage: "frame-timeout",
detail: `WS upgraded but no framebuffer message within ${frameTimeoutMs}ms`,
});
}, frameTimeoutMs);
};
ws.onmessage = (ev) => {
if (frameTimer) clearTimeout(frameTimer);
let bytes = 0;
let kind: string = typeof ev.data;
if (ev.data instanceof ArrayBuffer) {
bytes = ev.data.byteLength;
kind = "ArrayBuffer";
} else if (typeof Blob !== "undefined" && ev.data instanceof Blob) {
bytes = ev.data.size;
kind = "Blob";
} else if (typeof ev.data === "string") {
bytes = ev.data.length;
kind = "string";
}
finish({
ok: bytes > 0,
stage: "frame",
detail:
bytes > 0
? "received framebuffer message"
: "first message was empty",
frameBytes: bytes,
frameKind: kind,
});
};
ws.onclose = (ev) => {
// A close BEFORE open === failed upgrade (1006 abnormal / 403
// forbidden surface here). A close AFTER we already saw a frame is
// benign (our own finish() triggered it).
if (!upgraded) {
clearTimeout(upgradeTimer);
finish({
ok: false,
stage: "upgrade-close",
detail: `WS closed before upgrade (code=${ev.code}, reason="${ev.reason}") — handshake rejected somewhere in edge → ws-proxy → EIC → websockify → x11vnc`,
closeCode: ev.code,
});
}
};
ws.onerror = () => {
if (!upgraded) {
clearTimeout(upgradeTimer);
finish({
ok: false,
stage: "upgrade-error",
detail: "WS error before upgrade — proxy chain rejected the handshake",
});
}
};
});
},
{
rawSessionUrl: sessionUrl,
upgradeTimeoutMs: WS_UPGRADE_TIMEOUT_MS,
frameTimeoutMs: FIRST_FRAME_TIMEOUT_MS,
},
);
// 3. Assert the real outcome. No "flaky" escape hatch: each failure stage
// names the broken hop so a reviewer can act on it directly.
expect(
result.ok,
`take-control failed at stage="${result.stage}": ${result.detail}` +
(result.closeCode ? ` (close code ${result.closeCode})` : ""),
).toBe(true);
expect(
result.stage,
`expected to reach the 'frame' stage; got '${result.stage}' (${result.detail})`,
).toBe("frame");
expect(
result.frameBytes ?? 0,
`framebuffer message should be non-empty (kind=${result.frameKind})`,
).toBeGreaterThan(0);
});
});
+4 -90
View File
@@ -337,99 +337,13 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
// 7. Hand state off to tests + teardown — overwrite the slug-only
// bootstrap state with the full state spec tests need.
//
// FAIL-CLOSED handoff: every field the spec reads must be non-empty. If
// any is missing here, the spec's env-presence guard would throw with a
// generic "did setup run?" message that hides WHICH field was lost. Catch
// it at the source — a partial provision must hard-fail setup, never hand
// off a half-built state that the spec then has to diagnose (or worse,
// skip). This is the loud, fail-closed contract: STAGING was requested,
// so an incomplete provision is an error, not a skip.
const handoff = { slug, tenantURL, workspaceId, tenantToken };
const missingFields = Object.entries(handoff)
.filter(([, v]) => !v)
.map(([k]) => k);
if (missingFields.length > 0) {
throw new Error(
`[staging-setup] provision incomplete — empty handoff field(s): ` +
`${missingFields.join(", ")}. Refusing to hand off a partial state ` +
`that would surface downstream as an opaque spec failure.`,
);
}
writeFileSync(stateFile, JSON.stringify(handoff, null, 2));
writeFileSync(
stateFile,
JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2),
);
process.env.STAGING_SLUG = slug;
process.env.STAGING_TENANT_URL = tenantURL;
process.env.STAGING_WORKSPACE_ID = workspaceId;
process.env.STAGING_TENANT_TOKEN = tenantToken;
// The ephemeral org's UUID — exported so specs that route through the CP
// edge can send X-Molecule-Org-Id (workspace-server TenantGuard). The tabs
// harness hits the tenant box same-origin and doesn't need it, but the
// take-control gate (staging-display.spec.ts) does.
process.env.STAGING_ORG_ID = orgID;
console.log(`[staging-setup] Ready — ${stateFile}`);
// 8. (core#2261 Gap 1) Resolve the STANDING desktop-capable org, if one is
// configured, for the live take-control e2e (staging-display.spec.ts).
//
// This block is FULLY env-gated and additive: it provisions NOTHING and is
// a no-op unless STAGING_DISPLAY_SLUG is set. We deliberately do NOT spin a
// desktop workspace inside this shared setup — a desktop AMI boots in
// ~12-15 min and would tax every tabs run. Instead an operator stands up
// one always-on desktop org once (a CTO cost item) and points
// STAGING_DISPLAY_SLUG + STAGING_DISPLAY_WORKSPACE_ID at it. Here we just
// resolve that standing org's tenant URL, admin token, and org id so the
// display spec can reach it. Fail-closed: if STAGING_DISPLAY_SLUG is set but
// we can't resolve its token/id, we THROW — the gate must never silently
// fall back to the (non-desktop) ephemeral org and pass.
const displaySlug = process.env.STAGING_DISPLAY_SLUG;
if (displaySlug) {
console.log(`[staging-setup] Resolving standing desktop org: ${displaySlug}`);
// org id for the standing slug (admin-orgs row carries it + status).
const orgsRes = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
if (orgsRes.status !== 200) {
throw new Error(
`STAGING_DISPLAY_SLUG=${displaySlug} set, but GET /cp/admin/orgs returned ` +
`${orgsRes.status} — cannot resolve the standing desktop org for the ` +
`take-control gate.`,
);
}
const displayRow = (orgsRes.body?.orgs || []).find(
(o: any) => o.slug === displaySlug,
);
if (!displayRow?.id) {
throw new Error(
`STAGING_DISPLAY_SLUG=${displaySlug} not found in /cp/admin/orgs — the ` +
`standing desktop org for the take-control gate does not exist. Provision ` +
`it (one always-on desktop EC2) or unset STAGING_DISPLAY_SLUG/` +
`STAGING_DISPLAY_WORKSPACE_ID to skip the gate.`,
);
}
if (displayRow.instance_status !== "running") {
throw new Error(
`Standing desktop org ${displaySlug} is '${displayRow.instance_status}', ` +
`not 'running' — the take-control gate needs a live desktop tenant. ` +
`full row: ${JSON.stringify(displayRow)}`,
);
}
const displayTokRes = await jsonFetch(
`${CP_URL}/cp/admin/orgs/${displaySlug}/admin-token`,
{ headers: adminAuth },
);
if (displayTokRes.status !== 200 || !displayTokRes.body?.admin_token) {
throw new Error(
`admin-token fetch for standing desktop org ${displaySlug} returned ` +
`${displayTokRes.status}: ${JSON.stringify(displayTokRes.body)}`,
);
}
process.env.STAGING_DISPLAY_ORG_ID = displayRow.id;
process.env.STAGING_DISPLAY_TENANT_URL = `https://${displaySlug}.${TENANT_DOMAIN}`;
process.env.STAGING_DISPLAY_TENANT_TOKEN = displayTokRes.body.admin_token;
console.log(
`[staging-setup] Standing desktop org resolved: ${displaySlug} ` +
`(org_id=${displayRow.id}, url=${process.env.STAGING_DISPLAY_TENANT_URL})`,
);
}
}
+33 -305
View File
@@ -1,8 +1,7 @@
/**
* Staging canvas E2E — opens each workspace-panel tab against a fresh
* staging org provisioned in the global setup. Asserts each tab renders
* REAL content (not an empty container, not an error state) and captures a
* screenshot for visual review.
* Staging canvas E2E — opens each of the 13 workspace-panel tabs against a
* fresh staging org provisioned in the global setup. Asserts each tab
* renders without throwing and captures a screenshot for visual review.
*
* Auth model: the tenant platform's AdminAuth middleware accepts a bearer
* token OR a WorkOS session cookie. Playwright can't mint a WorkOS
@@ -11,39 +10,17 @@
* Bearer header via context.setExtraHTTPHeaders(). Every browser
* request inherits the header.
*
* PROMOTION-READINESS (see § at bottom of file): this suite is being
* hardened toward becoming a HARD merge-gate. It currently runs under
* `continue-on-error: true` (RFC internal#219 §1, non-gating) — that is a
* deliberate, CTO-owned call and is NOT changed here. The hardening makes
* every assertion deterministic so that WHEN promotion happens the gate
* does not flap. See the PROMOTION-READINESS block at the foot of this
* file for what is now reliable and what still blocks promotion.
*
* Known SaaS gaps — documented in #1369. These tabs legitimately cannot
* load real content in SaaS mode and are allowed an in-panel empty/error
* state (NOT a hard crash, NOT an ErrorBoundary):
* Known SaaS gaps — documented in #1369 and allowed to render errored
* content without failing the test (the gate is "no hard crash, no
* 'Failed to load' toast"):
* - Files tab: empty (platform can't docker exec into a remote EC2)
* - Terminal tab: WS connect fails
* - Peers tab: 401 without workspace-scoped token
* These are enumerated in KNOWN_DEGRADED_TABS below and asserted with a
* weaker (but still non-trivial) contract: the panel renders and does not
* crash the app. Every OTHER tab must render real content.
*/
import { test, expect, type Page } from "@playwright/test";
import { test, expect } from "@playwright/test";
// Tab ids as declared in canvas/src/components/SidePanel.tsx TABS.
//
// NOTE (drift guard): this list is asserted-complete against the live DOM
// below (see "tab list parity" step) so it cannot silently drift out of
// sync with SidePanel.tsx TABS the way a hand-maintained constant does.
// `display` and `container-config` are intentionally EXCLUDED here:
// - `display` is owned by the in-flight take-control e2e (PR #2275 /
// staging-display.spec.ts); asserting it here would collide.
// - `container-config` only renders when selectedNodeId is set AND is
// gated on tier; it is covered by container-config-specific specs.
// The parity check accounts for these via EXPECTED_EXTRA_TABS so a NEW
// tab appearing in SidePanel still trips the guard.
const TAB_IDS = [
"chat",
"activity",
@@ -60,131 +37,12 @@ const TAB_IDS = [
"audit",
] as const;
// Tabs present in the DOM that this spec intentionally does not drive.
// Keeping this explicit means a genuinely-new tab (not one of these) makes
// the parity assertion fail LOUD instead of being silently un-tested.
const EXPECTED_EXTRA_TABS = ["display", "container-config"] as const;
// Tabs that are KNOWN to degrade in SaaS mode (#1369). They get the weaker
// "renders + no crash" contract instead of the "real content" contract.
// Anything NOT in this set must render real content or the test fails.
const KNOWN_DEGRADED_TABS = new Set<string>(["terminal", "files"]);
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
// IMPORTANT — fail-closed, not skip-green.
//
// `test.skip(!STAGING)` is correct ONLY when the operator never asked for a
// staging run (CANVAS_E2E_STAGING unset). In that case the workflow's
// detect-changes / token-check gates have already decided not to exercise
// staging, and skipping is the documented contract.
//
// But if STAGING *is* requested (CANVAS_E2E_STAGING=1) and global setup did
// NOT hand off the tenant state, that is a HARD failure, not a skip — see
// the explicit env-presence throw inside the test body. A silent skip there
// would let a broken provision ship green, which is exactly the
// weak-gate failure this hardening removes (§ No flakes / internal#828).
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — staging-only suite, not requested");
/**
* Assert the panel for `tabId` rendered real content.
*
* Deterministic contract (no fixed waits — every step is condition-based
* with Playwright's built-in retry / expect.poll):
* 1. The tabpanel container is visible.
* 2. The global ErrorBoundary did NOT trip ("Something went wrong").
* 3. No visible error alert is shown in the panel.
* 4. For non-degraded tabs: the panel settles to non-empty,
* non-spinner content (so an empty <div/> or a stuck "Loading…"
* spinner FAILS instead of passing as it did before).
*/
async function assertPanelRendered(page: Page, tabId: string): Promise<void> {
const panel = page.locator(`#panel-${tabId}`);
// (1) Container visible. Built-in retry up to the expect timeout — no
// arbitrary waitForTimeout. Mechanism: replaces any reliance on a fixed
// settle delay with a real visibility condition.
await expect(panel, `panel for ${tabId} never became visible`).toBeVisible({
timeout: 10_000,
});
// (2) ErrorBoundary trip = hard crash anywhere in the React subtree.
// canvas/src/components/ErrorBoundary.tsx renders "Something went wrong".
// The OLD gate only looked for a "Failed to load" toast and would ship
// an ErrorBoundary-crashed panel GREEN. Mechanism: assert the crash
// surface is absent, retried via expect.poll so a late-mounting crash
// banner is still caught.
await expect
.poll(
async () =>
page.getByText("Something went wrong", { exact: false }).count(),
{
message: `tab ${tabId}: ErrorBoundary tripped (Something went wrong)`,
timeout: 5_000,
},
)
.toBe(0);
// (3) No visible error alert inside the panel. Tabs surface load errors
// as role="alert" with the real error text (EventsTab/ChannelsTab/
// ConfigTab/...). The OLD gate matched ONLY [role=alert]:has-text("Failed
// to load") — it missed (a) error messages that don't contain that exact
// phrase and (b) error divs that omit role="alert" entirely (e.g.
// ActivityTab). We replace it with a broader, but still SaaS-gap-aware,
// check: any *visible* alert OR red error banner inside the panel.
//
// Degraded tabs (#1369) are allowed an error state — for those we only
// require no app-level crash (covered by step 2). For every other tab a
// visible error alert is a real regression.
if (!KNOWN_DEGRADED_TABS.has(tabId)) {
const visibleAlerts = panel.locator('[role="alert"]:visible');
await expect
.poll(async () => visibleAlerts.count(), {
message:
`tab ${tabId}: a visible error alert is shown in the panel ` +
`(was a weak "Failed to load"-only check before)`,
timeout: 5_000,
})
.toBe(0);
}
// (4) Real content. The tabpanel CONTAINER always mounts, so the old
// toBeVisible() on the container passed even when the child rendered
// nothing. Assert the panel's trimmed innerText is non-empty AND not
// stuck on a loading spinner. expect.poll retries until the async
// fetch+render settles — replacing the implicit "the network finished
// by now" timing assumption with an explicit polled condition.
//
// Degraded tabs may legitimately be empty (Files in SaaS mode), so they
// are exempt from the non-empty requirement; step 2 still guards them
// against a hard crash.
if (!KNOWN_DEGRADED_TABS.has(tabId)) {
await expect
.poll(
async () => {
const text = ((await panel.innerText()) || "").trim();
// A panel still showing only a loading spinner has not settled.
const stillLoading = /^(loading\b|loading…|loading\.\.\.)/i.test(
text,
);
return text.length > 0 && !stillLoading;
},
{
message:
`tab ${tabId}: panel rendered empty or stuck on a loading ` +
`spinner — no real content settled (weak "container visible" ` +
`gate would have passed this)`,
// Generous: real tabs fetch from the tenant over the network.
// Polled, so it returns as soon as content appears.
timeout: 20_000,
},
)
.toBe(true);
}
}
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
test.describe("staging canvas tabs", () => {
test("each workspace-panel tab renders real content", async ({
test("each workspace-panel tab renders without error", async ({
page,
context,
}) => {
@@ -192,16 +50,9 @@ test.describe("staging canvas tabs", () => {
const tenantToken = process.env.STAGING_TENANT_TOKEN;
const workspaceId = process.env.STAGING_WORKSPACE_ID;
// FAIL-CLOSED (not skip): STAGING was requested but global setup did
// not export tenant state. A silent skip here would paint a broken
// provision GREEN. This is the loud-fail the hardening mandates.
if (!tenantURL || !tenantToken || !workspaceId) {
throw new Error(
"staging-setup.ts did not export STAGING_TENANT_URL / " +
"STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID. CANVAS_E2E_STAGING=1 " +
"was set (staging WAS requested) but global setup produced no " +
"tenant — this is a provisioning failure, NOT a reason to skip. " +
"Check the [staging-setup] log above for the real error.",
"staging-setup.ts did not export STAGING_TENANT_URL / STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID — did global setup run?",
);
}
@@ -301,19 +152,11 @@ test.describe("staging canvas tabs", () => {
// omit the URL, so we'd otherwise be flying blind. Logged to the
// test's stdout (visible in the workflow log under the failed step).
page.on("requestfailed", (req) => {
console.log(
`[e2e/requestfailed] ${req.method()} ${req.url()}: ${
req.failure()?.errorText ?? "?"
}`,
);
console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
});
page.on("response", (res) => {
if (res.status() >= 400) {
console.log(
`[e2e/response-${res.status()}] ${res
.request()
.method()} ${res.url()}`,
);
console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
}
});
@@ -330,8 +173,9 @@ test.describe("staging canvas tabs", () => {
// hydrated, even with zero workspaces) or the hydration-error
// banner — whichever wins first. Previous version of this wait
// used `[role="tablist"]`, but that selector only appears AFTER
// a workspace node is clicked, so the wait would always time out
// at 45s before any meaningful failure surfaced.
// a workspace node is clicked (which happens below at L100), so
// the wait would always time out at 45s before any meaningful
// failure surfaced.
await page.waitForSelector(
'[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
{ timeout: 45_000 },
@@ -345,20 +189,10 @@ test.describe("staging canvas tabs", () => {
"canvas hydration failed — check staging CP + tenant reachability",
).toBe(0);
// The global ErrorBoundary must not have tripped at the app root
// either — a crash before the side panel even opens would otherwise
// be invisible until a tab assertion happened to notice it.
await expect(
page.getByText("Something went wrong", { exact: false }),
"app-level ErrorBoundary tripped during hydration",
).toHaveCount(0);
// Click the workspace node to open the side panel. Try a data
// attribute first, fall back to a generic role-based selector so
// the test doesn't break when the node-card markup changes.
const byDataAttr = page
.locator(`[data-workspace-id="${workspaceId}"]`)
.first();
const byDataAttr = page.locator(`[data-workspace-id="${workspaceId}"]`).first();
if ((await byDataAttr.count()) > 0) {
await byDataAttr.click({ timeout: 10_000 });
} else {
@@ -368,56 +202,19 @@ test.describe("staging canvas tabs", () => {
await firstNode.click({ timeout: 10_000 });
}
// The tablist appears once the side panel mounts. Condition-based
// wait — no fixed delay.
const tablist = page.locator('[role="tablist"]');
await expect(
tablist,
"side panel tablist never appeared after clicking the workspace node",
).toBeVisible({ timeout: 15_000 });
// Tab-list parity guard. The hand-maintained TAB_IDS constant used to
// be able to drift silently out of sync with SidePanel.tsx TABS — a
// tab could be added to the UI and never get an assertion, shipping
// broken-but-untested. Read the actual tab ids from the DOM and assert
// every live tab is either driven by this spec (TAB_IDS) or explicitly
// excluded (EXPECTED_EXTRA_TABS). A genuinely-new tab fails LOUD.
const liveTabIds = (
await tablist.locator('[role="tab"][id^="tab-"]').evaluateAll((els) =>
els.map((el) => el.id.replace(/^tab-/, "")),
)
).sort();
const accountedFor = new Set<string>([
...TAB_IDS,
...EXPECTED_EXTRA_TABS,
]);
const unaccounted = liveTabIds.filter((id) => !accountedFor.has(id));
expect(
unaccounted,
`SidePanel exposes tab(s) this spec neither drives nor excludes: ` +
`${unaccounted.join(", ")}. Add them to TAB_IDS (and assert their ` +
`content) or to EXPECTED_EXTRA_TABS with a reason.`,
).toHaveLength(0);
// And the inverse: every TAB_ID we intend to drive must actually exist
// in the DOM, so a renamed/removed tab fails here instead of timing out
// on a missing #tab-<id> selector with an opaque message.
const missing = TAB_IDS.filter((id) => !liveTabIds.includes(id));
expect(
missing,
`TAB_IDS references tab(s) not present in SidePanel: ${missing.join(
", ",
)} — the spec's tab list has drifted from SidePanel.tsx TABS.`,
).toHaveLength(0);
await page.waitForSelector('[role="tablist"]', { timeout: 15_000 });
for (const tabId of TAB_IDS) {
await test.step(`tab: ${tabId}`, async () => {
const tabButton = page.locator(`#tab-${tabId}`);
// The TABS bar is `overflow-x-auto` — tabs past position ~3 are
// clipped behind the right-edge fade gradient on smaller
// viewports. Playwright's toBeVisible() returns false for clipped
// elements, so a bare visibility check fails on later tabs in CI.
// scrollIntoViewIfNeeded brings the button into view before the
// visibility check.
// The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
// wrapper) — tabs after position ~3 are clipped behind the
// right-edge fade gradient on smaller viewports. Playwright's
// `toBeVisible()` returns false for clipped elements, so a
// bare visibility check fails on `skills` and later tabs in
// CI. scrollIntoViewIfNeeded brings the button into view
// before the visibility check, mirroring what SidePanel's own
// keyboard handler does on arrow-key navigation.
await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
await expect(
tabButton,
@@ -425,34 +222,18 @@ test.describe("staging canvas tabs", () => {
).toBeVisible({ timeout: 5_000 });
await tabButton.click();
// Confirm the click actually activated this tab before asserting
// its content — aria-selected flips on the active tab. This closes
// a race where a slow click handler left the PREVIOUS tab's panel
// mounted and we asserted the wrong panel's content. Built-in
// retry, condition-based, no fixed wait.
await expect(
tabButton,
`tab-${tabId} did not become the selected tab after click`,
).toHaveAttribute("aria-selected", "true", { timeout: 5_000 });
const panel = page.locator(`#panel-${tabId}`);
await expect(panel, `panel for ${tabId} never rendered`).toBeVisible({
timeout: 10_000,
});
// Real-content assertion (the core hardening). See
// assertPanelRendered: container visible + no ErrorBoundary + no
// visible error alert + settled non-empty content for non-degraded
// tabs. Replaces the old "panel visible + no Failed-to-load toast"
// pair, which shipped empty/errored panels green.
await assertPanelRendered(page, tabId);
// Belt to the braces: the original toast check stays. A global
// "Failed to load" toast (role=alert outside the panel) is still a
// crash signal worth catching even though the in-panel checks above
// now do the heavy lifting.
// "Failed to load" toast = hard crash. Known SaaS-mode gaps
// (Files empty, Terminal disconnected, Peers 401) surface as
// in-panel content, not toasts.
const errorToasts = await page
.locator('[role="alert"]:has-text("Failed to load")')
.count();
expect(
errorToasts,
`tab ${tabId}: a global "Failed to load" toast is showing`,
).toBe(0);
expect(errorToasts, `tab ${tabId}: "Failed to load" toast`).toBe(0);
await page.screenshot({
path: `test-results/staging-tab-${tabId}.png`,
@@ -486,56 +267,3 @@ test.describe("staging canvas tabs", () => {
).toHaveLength(0);
});
});
/*
* PROMOTION-READINESS — staging canvas E2E → HARD merge-gate
* ----------------------------------------------------------
* NOW RELIABLE (deterministic; these no longer flap on timing):
* - Every wait is condition-based (toBeVisible / toHaveAttribute /
* expect.poll). There is NO fixed waitForTimeout / sleep in the spec;
* the only setTimeout is the bounded poll-interval inside
* staging-setup.ts waitFor(), which has a hard deadline.
* - Tabs are asserted on REAL settled content (non-empty, non-spinner),
* not just "container is visible" — an empty or stuck-loading panel now
* fails instead of shipping green.
* - The ErrorBoundary ("Something went wrong") is asserted absent at app
* hydration AND per tab — a React subtree crash can no longer pass.
* - Visible error alerts inside a panel fail non-degraded tabs (was a
* weak [role=alert]:has-text("Failed to load")-only check that missed
* both other error phrasings and role-less error divs).
* - The driven tab list is parity-checked against the live DOM, so a new
* SidePanel tab can't ship un-tested and a removed one fails loud.
* - Click→activation is confirmed (aria-selected) before asserting the
* panel, removing a wrong-panel race.
* - The suite is fail-closed: CANVAS_E2E_STAGING=1 with no tenant state
* hard-errors (never skips→green); CANVAS_E2E_STAGING unset cleanly
* skips (operator did not request staging).
*
* STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT flip continue-on-error here —
* CTO-owned, RFC internal#219 §1):
* - INFRA DEPENDENCY: each run provisions a real staging EC2 tenant
* (12-20 min cold boot). Required-gate latency + AWS/Cloudflare/CP
* availability become merge-blockers. A staging outage would freeze
* main even though the code is fine — unacceptable for a required check
* until staging has an SLA or this runs against a warm pre-provisioned
* pool.
* - SHARED-RESOURCE FLAKE SURFACE: TLS/DNS/ACME propagation on a shared
* staging zone (staging-setup TLS_TIMEOUT_MS) is outside this repo's
* control. Deterministic here ≠ deterministic upstream.
* - SECRET DEPENDENCY: CP_STAGING_ADMIN_API_TOKEN must be present on the
* runner. The workflow's skip-if-absent (core#2225) keeps a missing
* secret from painting red — correct for non-gating, but a REQUIRED
* check must instead guarantee the secret is always present, else it
* skip-greens the very thing it is supposed to enforce.
* - SINGLE-WORKSPACE COVERAGE: one hermes/platform_managed workspace that
* does NOT boot an agent on staging (no CP LLM proxy env, workspace-
* server #2162). Tabs render, but agent-dependent content paths (live
* chat round-trip, traces from a real run) are not exercised.
*
* PROMOTION CHECKLIST (when CTO signs off on making this required):
* 1. Warm pre-provisioned tenant pool OR a staging SLA bounding boot time.
* 2. Guarantee CP_STAGING_ADMIN_API_TOKEN on the gating runner; turn the
* skip-if-absent into a hard error for the required path.
* 3. Decide whether agent-dependent tabs need a wired LLM proxy on the
* staging tenant (covers chat/traces real content) before gating them.
*/
-8
View File
@@ -7,14 +7,6 @@ export default defineConfig({
fullyParallel: false,
workers: 1,
retries: 0,
// Fail CLOSED when an explicit spec selection matches zero tests.
// Playwright defaults this to true, so `playwright test e2e/chat-*.spec.ts`
// would exit 0 (green) if those files were renamed/moved/deleted — a
// false-green that would silently gut the e2e-chat gate after a refactor.
// forbidOnly likewise stops a stray `test.only` from green-ing the suite
// while skipping every other case.
passWithNoTests: false,
forbidOnly: !!process.env.CI,
use: {
baseURL: process.env.PLAYWRIGHT_BASE_URL || "http://localhost:3000",
headless: true,
+1 -1
View File
@@ -114,7 +114,7 @@ Opt-in pattern: when `idle_prompt` is non-empty in `config.yaml`, the workspace
Three Gin middleware classes gate server-side routes. Full contract in `docs/runbooks/admin-auth.md`.
- **`middleware.AdminAuth(db.DB)`** — strict bearer-only and **fail-closed in every environment** (harden/no-fail-open-auth). Used for any route where a forged request could leak prompts/memory, create/mutate workspaces, or leak ops intel. The former lazy-bootstrap fail-open (pass when `HasAnyLiveTokenGlobal` returns 0) and the dev-mode escape hatch have both been removed — a fresh install must provision `ADMIN_TOKEN` to reach admin routes.
- **`middleware.AdminAuth(db.DB)`** — strict bearer-only. Used for any route where a forged request could leak prompts/memory, create/mutate workspaces, or leak ops intel. Lazy-bootstrap fail-open when `HasAnyLiveTokenGlobal` returns 0.
- **`middleware.CanvasOrBearer(db.DB)`** — accepts a bearer token OR an Origin matching `CORS_ORIGINS`. Used **only** for cosmetic routes where a forged request has zero data/security impact. Currently only on `PUT /canvas/viewport`. Do not extend this to any route that leaks data or creates resources — see the runbook.
- **`middleware.WorkspaceAuth(db.DB)`** — binds a bearer token to `:id`. Workspace A's token cannot hit workspace B's sub-routes. Used for the entire `/workspaces/:id/*` group except the A2A proxy (which has its own `CanCommunicate` layer).
+3 -9
View File
@@ -24,7 +24,7 @@ cd molecule-core
That single script:
1. Generates an `ADMIN_TOKEN` into `.env` (first run only — preserved on re-runs) and exports the matching `NEXT_PUBLIC_ADMIN_TOKEN` so the canvas authenticates with it. Auth is **fail-closed in every environment** (including local dev) — there is no dev-mode fail-open; the canvas reaches admin/workspace routes only because it sends this bearer.
1. Generates an `ADMIN_TOKEN` into `.env` (first run only — preserved on re-runs)
2. Brings up Postgres, Redis, Langfuse, ClickHouse, and Temporal via `infra/scripts/setup.sh`
3. Populates the workspace template + plugin registry from `manifest.json`
4. Builds and starts the platform on `http://localhost:8080`
@@ -62,17 +62,11 @@ If you only want the raw compose flow:
docker compose -f docker-compose.infra.yml up -d
```
> **Auth is fail-closed even in local dev.** Pick any local admin token and
> set it on *both* sides — the platform (`ADMIN_TOKEN`) and the canvas
> (`NEXT_PUBLIC_ADMIN_TOKEN`, same value). Without it the canvas 401s on every
> admin/workspace call. (`scripts/dev-start.sh` does this for you; the manual
> steps below set it explicitly.)
### Step 3: Start the platform
```bash
cd workspace-server
ADMIN_TOKEN=dev-local-admin-token MOLECULE_ENV=development go run ./cmd/server
go run ./cmd/server
```
The control plane listens on `http://localhost:8080`.
@@ -84,7 +78,7 @@ In a new terminal:
```bash
cd canvas
npm install
NEXT_PUBLIC_ADMIN_TOKEN=dev-local-admin-token npm run dev # MUST match ADMIN_TOKEN above
npm run dev
```
Open `http://localhost:3000`.
+4 -32
View File
@@ -1,29 +1,5 @@
# Admin Authentication Runbook
## Auth is fail-CLOSED in every environment — `ADMIN_TOKEN` is the bootstrap credential
Per the CTO "nothing should be fail-open" directive, **every** auth path on the
workspace-server fails closed — there is no dev-mode / zero-token / DB-outage
hatch that grants access. This includes:
- `AdminAuth` and `WorkspaceAuth` (admin + per-workspace routes),
- `CanvasOrBearer` (the cosmetic `PUT /canvas/viewport` route), and
- `validateDiscoveryCaller` (`/registry/:id/peers`, `/registry/discover/:id`).
Consequence for **bootstrap**: a brand-new self-hosted / dev install has **no
DB-backed tokens yet**, and there is no longer a fail-open that lets the first
request through. The **only** way to reach admin routes (and to mint the first
workspace token via `POST /admin/workspaces/:id/tokens`) is to set `ADMIN_TOKEN`
in the platform environment and present it as the bearer. This is the "local
mimics production" principle: there is no zero-config bootstrap.
- **Local dev:** `scripts/dev-start.sh` provisions a deterministic
`ADMIN_TOKEN` into `.env` (and exports the matching `NEXT_PUBLIC_ADMIN_TOKEN`
so the canvas authenticates with it). See `docs/quickstart.md`.
- **Self-hosted / SaaS:** set `ADMIN_TOKEN` to a strong random secret
(`openssl rand -base64 32`) in the platform env and bake the matching
`NEXT_PUBLIC_ADMIN_TOKEN` into the canvas bundle.
## Required: set `MOLECULE_ENV` in all non-dev environments
```bash
@@ -31,10 +7,8 @@ mimics production" principle: there is no zero-config bootstrap.
MOLECULE_ENV=production
```
This matches the production tenant default. NOTE: `MOLECULE_ENV` no longer gates
any auth decision — it only drives NON-security local-dev conveniences (loopback
bind, relaxed rate limit). Setting it to `dev`/`development` does **not** relax
authentication. Staging and production smoke tests should use the real user/API
This matches the production tenant default and disables development-only
shortcuts. Staging and production smoke tests should use the real user/API
workflow: create a workspace, then mint a one-time displayed workspace bearer
with `POST /admin/workspaces/:id/tokens`.
@@ -49,7 +23,5 @@ The platform uses `ADMIN_TOKEN` as the bearer credential for admin-gated endpoin
| `POST /org/import` | `Authorization: Bearer <ADMIN_TOKEN>` |
| `POST /admin/workspaces/:id/tokens` | `Authorization: Bearer <ADMIN_TOKEN>`; plaintext token returned once |
Missing or invalid bearer → **401 in every environment** (fail-closed; no
dev-mode fail-open). If the auth datastore is unreachable, auth-gated routes
return **503** (`platform_unavailable`) — an availability tradeoff that grants no
access — rather than allowing the request through.
Missing or invalid `ADMIN_TOKEN` → AdminAuth fails open in dev mode (no token set), or
returns 401 in production mode (token set but invalid).
+22 -51
View File
@@ -46,67 +46,46 @@ cleanup() {
trap cleanup EXIT INT TERM
# ─────────────────────────────────────────────── 1. dev-mode auth posture
# The AdminAuth middleware closes its fail-open the moment the first
# workspace token lands in the DB — at which point /workspaces and
# other admin routes 401 unless the caller has either ADMIN_TOKEN or
# the dev-mode escape hatch. The canvas at localhost:3000 has no
# bearer token to send, so without one of those two paths it can't
# call admin endpoints after a workspace exists.
#
# SECURITY (harden/no-fail-open-auth): the workspace-server auth chain is
# now fail-CLOSED in EVERY environment, dev included. There is NO dev-mode
# fail-open escape hatch anymore — AdminAuth / WorkspaceAuth / discovery all
# require a real credential. So local dev must AUTHENTICATE, not run open.
# For local dev the right posture is the dev-mode escape hatch:
#
# The clean way to keep the canvas working locally is to provision a
# deterministic ADMIN_TOKEN and hand the matching NEXT_PUBLIC_ADMIN_TOKEN to
# the canvas bundle. The canvas already attaches `Authorization: Bearer
# $NEXT_PUBLIC_ADMIN_TOKEN` on every platform call (canvas/src/lib/api.ts),
# and next.config.ts warns if the pair is half-set. We set BOTH here.
# MOLECULE_ENV=development AND ADMIN_TOKEN unset
#
# MOLECULE_ENV=development — dev conveniences (loopback bind, relaxed
# rate limit). NOT an auth lever.
# ADMIN_TOKEN=<dev value> — server-side bearer AdminAuth/WorkspaceAuth
# enforce (Tier-2b). Real credential.
# NEXT_PUBLIC_ADMIN_TOKEN — same value, baked into the canvas bundle so
# the browser sends the matching bearer.
# That makes middleware.isDevModeFailOpen() return true and lets the
# canvas keep working without a bearer. Setting ADMIN_TOKEN here
# would BREAK the canvas (it has no way to read that token in dev).
#
# For SaaS the platform is provisioned with a random ADMIN_TOKEN + the
# canvas image baked with the matching NEXT_PUBLIC_ADMIN_TOKEN, plus
# MOLECULE_ENV=production. Same shape, stronger secret.
# For SaaS the platform is provisioned with ADMIN_TOKEN set AND
# MOLECULE_ENV=production — either one closes the hatch. So the dev
# mode signal here is safe (it's only active when both other knobs
# are absent).
if [ -f "$ENV_FILE" ] && grep -q '^MOLECULE_ENV=' "$ENV_FILE"; then
echo "==> Reusing MOLECULE_ENV from existing .env"
else
echo "==> Setting MOLECULE_ENV=development in .env"
echo "==> Setting MOLECULE_ENV=development in .env (dev-mode auth hatch)"
{
if [ -f "$ENV_FILE" ]; then
cat "$ENV_FILE"
echo ""
fi
echo "# Generated by scripts/dev-start.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "# Local-dev conveniences (loopback bind, relaxed rate limit)."
echo "# Auth is fail-closed even in dev — see ADMIN_TOKEN below."
echo "# Local-dev auth posture: dev-mode fail-open lets the canvas at"
echo "# localhost:3000 call admin endpoints without a bearer token."
echo "# DO NOT set ADMIN_TOKEN here in dev — it would close the hatch"
echo "# and the canvas would 401 on every admin call."
echo "MOLECULE_ENV=development"
} > "$ENV_FILE.tmp"
mv "$ENV_FILE.tmp" "$ENV_FILE"
echo " Saved to $ENV_FILE"
fi
# Provision a deterministic dev ADMIN_TOKEN (idempotent — preserved across
# re-runs). This is the credential the canvas authenticates with locally; it
# is NOT a secret (it only guards your own localhost stack), so a fixed,
# well-known value is fine and keeps re-runs reproducible.
DEV_ADMIN_TOKEN="dev-local-admin-token"
if [ -f "$ENV_FILE" ] && grep -q '^ADMIN_TOKEN=' "$ENV_FILE"; then
echo "==> Reusing ADMIN_TOKEN from existing .env"
else
echo "==> Provisioning dev ADMIN_TOKEN in .env (fail-closed auth, authenticated canvas)"
{
cat "$ENV_FILE"
echo ""
echo "# Dev ADMIN_TOKEN — the canvas authenticates with this locally."
echo "# Auth is fail-closed; without a matching bearer the canvas 401s."
echo "# Fixed value is fine: it only guards your localhost stack."
echo "ADMIN_TOKEN=$DEV_ADMIN_TOKEN"
} > "$ENV_FILE.tmp"
mv "$ENV_FILE.tmp" "$ENV_FILE"
echo " Saved to $ENV_FILE"
fi
# Source .env so the platform inherits ADMIN_TOKEN (and anything else
# the user has added — e.g. ANTHROPIC_API_KEY for skipping the canvas
# Secrets UI). `set -a` exports every assignment in the sourced file
@@ -116,12 +95,6 @@ set -a
. "$ENV_FILE"
set +a
# The canvas reads NEXT_PUBLIC_ADMIN_TOKEN at build/dev time and attaches it
# as the bearer on every platform call. Mirror the server-side ADMIN_TOKEN
# into it so the matched-pair guard in canvas/next.config.ts is satisfied and
# the browser authenticates. Exported for the `npm run dev` child below.
export NEXT_PUBLIC_ADMIN_TOKEN="$ADMIN_TOKEN"
# ─────────────────────────────────────────────── 2. infra + templates
# Use setup.sh (not raw docker-compose) so the template registry gets
@@ -222,9 +195,7 @@ cat <<EOF
Molecule AI dev environment ready
Canvas: http://localhost:3000
Platform: http://localhost:8080 (bound to loopback in dev)
Auth: fail-closed — canvas authenticates with the dev ADMIN_TOKEN
(ADMIN_TOKEN + NEXT_PUBLIC_ADMIN_TOKEN, see .env)
Platform: http://localhost:8080
Logs: /tmp/molecule-platform.log
/tmp/molecule-canvas.log
+3 -49
View File
@@ -17,33 +17,6 @@ e2e_extract_token() {
python3 "$(dirname "${BASH_SOURCE[0]}")/_extract_token.py"
}
# Populate a curl-args array with the platform admin bearer, IF one is set.
#
# AdminAuth (workspace-server/internal/middleware/wsauth_middleware.go:161)
# fail-opens ONLY while ADMIN_TOKEN is unset AND no workspace token exists yet
# (devmode.go:50). The e2e-api CI job now sets ADMIN_TOKEN on the platform and
# exports the matching MOLECULE_ADMIN_TOKEN here, which flips fail-open OFF — so
# every admin-gated route (GET/POST/DELETE /workspaces, /events, /bundles,
# /org/import, …) now requires the EXACT ADMIN_TOKEN as bearer (Tier-2b rejects
# workspace bearers, wsauth_middleware.go:250). Helpers that hit admin routes
# (e2e_cleanup_all_workspaces, e2e_delete_workspace's default path) must send it.
#
# Guarded if-set so a bootstrap/dev platform with no admin token (fail-open)
# still works with zero auth. Mirrors e2e_mint_workspace_token's admin_auth.
#
# Usage:
# local admin_auth=(); e2e_admin_auth_args admin_auth
# curl -s "$BASE/workspaces" ${admin_auth[@]+"${admin_auth[@]}"}
e2e_admin_auth_args() {
local _outname="$1"
local _bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
if [ -n "$_bearer" ]; then
eval "$_outname=(-H \"Authorization: Bearer \$_bearer\")"
else
eval "$_outname=()"
fi
}
# Delete every workspace currently on the platform. Use at the top of a
# script so count-based assertions are reproducible across runs.
# Mint a fresh workspace auth token via the real admin endpoint.
@@ -80,38 +53,19 @@ e2e_delete_workspace() {
if [ -z "$wid" ]; then
return 0
fi
# DELETE /workspaces/:id and GET /workspaces/:id-for-name are both behind
# AdminAuth (router.go:155 GET single is public, but List/Delete are gated at
# router.go:165-167). Callers that already pass a per-workspace bearer (e.g.
# test_api.sh's NEW_TOKEN) authenticate themselves; the cleanup-trap callers
# in poll-mode/notify/priority pass NO curl args and rely on this fallback to
# the platform admin bearer so the DELETE doesn't 401 once ADMIN_TOKEN is set.
if [ "${#curl_args[@]}" -eq 0 ]; then
e2e_admin_auth_args curl_args
fi
# ${curl_args[@]+"…"} guard: under `set -u` an empty array expands to an
# "unbound variable" error on bash <4.4 (macOS 3.2, some Linux). This form
# expands to nothing when the array is empty. Callers from the priority-
# runtimes EXIT trap pass no extra curl args, so the array IS empty there —
# without the guard the trap aborts non-zero AFTER the gate already passed,
# turning a validated run RED. (Same idiom already used for CREATED_WSIDS.)
if [ -z "$name" ]; then
name=$(curl -s "$BASE/workspaces/$wid" ${curl_args[@]+"${curl_args[@]}"} | python3 -c "import json,sys
name=$(curl -s "$BASE/workspaces/$wid" "${curl_args[@]}" | python3 -c "import json,sys
try:
print(json.load(sys.stdin).get('name',''))
except Exception:
pass" 2>/dev/null || true)
fi
curl -s -X DELETE "$BASE/workspaces/$wid?confirm=true" \
-H "X-Confirm-Name: $name" ${curl_args[@]+"${curl_args[@]}"} > /dev/null || true
-H "X-Confirm-Name: $name" "${curl_args[@]}" > /dev/null || true
}
e2e_cleanup_all_workspaces() {
# GET /workspaces (list) is AdminAuth-gated (router.go:165). Send the platform
# admin bearer if one is set so the list doesn't 401 → empty → no cleanup.
local _admin_auth=()
e2e_admin_auth_args _admin_auth
curl -s "$BASE/workspaces" ${_admin_auth[@]+"${_admin_auth[@]}"} | python3 -c "import json,sys
curl -s "$BASE/workspaces" | python3 -c "import json,sys
try:
[print(f\"{w.get('id','')}\\t{w.get('name','')}\") for w in json.load(sys.stdin)]
except Exception:
+2 -15
View File
@@ -11,10 +11,7 @@
# default + 401, see PR #1714.)
#
# claude-code → auth-aware:
# E2E_MINIMAX_API_KEY → "minimax:MiniMax-M2.7"
# (colon-namespaced BYOK id; bare
# "MiniMax-M2" 400s on a deploy-skewed
# staging registry — #2263)
# E2E_MINIMAX_API_KEY → "MiniMax-M2"
# E2E_ANTHROPIC_API_KEY → "claude-sonnet-4-6"
# otherwise → "sonnet"
#
@@ -85,17 +82,7 @@ pick_model_slug() {
hermes) printf 'openai/gpt-4o' ;;
claude-code)
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
# Namespaced (colon) BYOK id, not bare "MiniMax-M2" (#2263 deploy-skew):
# bare ids can lag the deployed staging ws-server's compiled registry,
# so workspace-create's validateRegisteredModelForRuntime 400s the bare
# form on an older image. The colon-namespaced `minimax:MiniMax-M2.7`
# resolves the same way the proven-working sibling `moonshot/kimi-k2.6`
# does. It stays in the BYOK `minimax` arm (providers.yaml:851), so
# DeriveProvider -> provider_selection=minimax (BYOK) and the #1994
# byok-not-platform guard (test_staging_full_saas.sh:1000) still passes —
# unlike the slash/platform form `minimax/MiniMax-M2.7`, which resolves
# to provider=platform and would trip that guard.
printf 'minimax:MiniMax-M2.7'
printf 'MiniMax-M2'
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
printf 'claude-sonnet-4-6'
else
+41 -61
View File
@@ -15,27 +15,18 @@ SUM_AUTH=()
ECHO_URL="https://example.com/echo-agent"
SUM_URL="https://example.com/summarizer-agent"
# AdminAuth-gated calls (GET/POST/DELETE /workspaces, /events, /bundles)
# require the platform admin bearer once ADMIN_TOKEN is set on the server.
# Tier-2b (wsauth_middleware.go:250) REJECTS workspace bearer tokens on admin
# routes when ADMIN_TOKEN is set, so admin calls MUST send the exact ADMIN_TOKEN
# value — which the e2e-api CI job exports here as MOLECULE_ADMIN_TOKEN. acurl =
# "admin curl": it always sends the platform admin bearer (if one is set).
#
# Guarded if-set: a fresh self-hosted/dev platform with no ADMIN_TOKEN fail-opens
# (devmode.go:50), so sending no bearer still works there.
ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
ADMIN_AUTH=()
[ -n "$ADMIN_BEARER" ] && ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
# AdminAuth-gated calls need a bearer token once any workspace token
# exists in the DB. ADMIN_TOKEN is populated after the first workspace
# create + real token mint. acurl = "authenticated curl".
ADMIN_TOKEN=""
acurl() {
curl -s ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} "$@"
if [ -n "$ADMIN_TOKEN" ]; then
curl -s -H "Authorization: Bearer $ADMIN_TOKEN" "$@"
else
curl -s "$@"
fi
}
# WORKSPACE_TOKEN holds a per-workspace bearer for the WorkspaceAuth-gated
# routes (PATCH /workspaces/:id, /activity, …). It is set after the first
# create+mint and is NOT interchangeable with the admin bearer.
WORKSPACE_TOKEN=""
# Pre-test cleanup: remove any workspaces left over from prior runs so
# count-based assertions ("empty", "count=2") are reproducible.
e2e_cleanup_all_workspaces
@@ -66,22 +57,19 @@ check "GET /health" '"status":"ok"' "$R"
R=$(acurl "$BASE/workspaces")
check "GET /workspaces (empty)" '[]' "$R"
# Test 3: Create workspace A. POST /workspaces is AdminAuth-gated (router.go:166);
# send the admin bearer (acurl). On a fail-open dev platform acurl sends nothing
# and the create still works.
R=$(acurl -X POST "$BASE/workspaces" -H "Content-Type: application/json" -d '{"name":"Echo Agent","tier":1,"runtime":"external","external":true}')
# Test 3: Create workspace A (AdminAuth fail-open — no tokens exist yet)
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" -d '{"name":"Echo Agent","tier":1,"runtime":"external","external":true}')
check "POST /workspaces (create echo)" '"status":"awaiting_agent"' "$R"
ECHO_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
# Per-workspace token for Echo, for the WorkspaceAuth-gated routes below.
WORKSPACE_TOKEN=$(echo "$R" | e2e_extract_token)
if [ -z "$WORKSPACE_TOKEN" ]; then
WORKSPACE_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "")
ADMIN_TOKEN=$(echo "$R" | e2e_extract_token)
if [ -z "$ADMIN_TOKEN" ]; then
ADMIN_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "")
fi
if [ -n "$WORKSPACE_TOKEN" ]; then
echo " (acquired Echo workspace token: ${WORKSPACE_TOKEN:0:8}...)"
if [ -n "$ADMIN_TOKEN" ]; then
echo " (acquired admin token: ${ADMIN_TOKEN:0:8}...)"
else
echo " WARNING: no Echo workspace token acquired — WorkspaceAuth calls will fail"
echo " WARNING: no admin token acquired — subsequent AdminAuth calls will fail"
fi
# Test 4: Create workspace B (needs bearer — tokens now exist in DB)
@@ -110,7 +98,7 @@ check "GET /workspaces/:id (agent_card null)" '"agent_card":null' "$R"
# Test 7: Register echo — use workspace-specific token (from real admin
# endpoint), not the admin token. C18 requires a token issued TO THIS
# workspace, not just any valid token.
ECHO_WS_TOKEN="$WORKSPACE_TOKEN"
ECHO_WS_TOKEN="$ADMIN_TOKEN"
[ -n "$ECHO_WS_TOKEN" ] && ECHO_AUTH=(-H "Authorization: Bearer $ECHO_WS_TOKEN")
R=$(curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" \
"${ECHO_AUTH[@]}" \
@@ -171,29 +159,26 @@ R=$(curl -s -X POST "$BASE/registry/check-access" -H "Content-Type: application/
-d "{\"caller_id\":\"$ECHO_ID\",\"target_id\":\"$SUM_ID\"}")
check "POST /registry/check-access (same-org allowed)" '"allowed":true' "$R"
# Test 15: PATCH workspace (update position). PATCH /workspaces/:id is
# WorkspaceAuth-gated (router.go:227 — #680 IDOR fix), so it needs Echo's OWN
# bearer, NOT the admin bearer (WorkspaceAuth rejects the admin token).
R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}')
# Test 15: PATCH workspace (update position)
R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}')
check "PATCH /workspaces/:id (position)" '"status":"updated"' "$R"
R=$(acurl "$BASE/workspaces/$ECHO_ID")
check "Position saved (x=100)" '"x":100' "$R"
check "Position saved (y=200)" '"y":200' "$R"
# Test 16: PATCH workspace (update name) — WorkspaceAuth-gated; use Echo's token.
R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}')
# Test 16: PATCH workspace (update name)
R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}')
check "PATCH /workspaces/:id (name)" '"status":"updated"' "$R"
R=$(acurl "$BASE/workspaces/$ECHO_ID")
check "Name updated" '"name":"Echo Agent v2"' "$R"
# Test 17: Events (#165 / PR #167 — admin-gated; the admin bearer is required,
# and Tier-2b rejects a workspace bearer here, so use acurl's admin token alone).
R=$(acurl "$BASE/events")
# Test 17: Events (#165 / PR #167 — now admin-gated, bearer required)
R=$(acurl "$BASE/events" -H "Authorization: Bearer $ECHO_TOKEN")
check "GET /events (has events)" 'WORKSPACE_ONLINE' "$R"
R=$(acurl "$BASE/events/$ECHO_ID")
R=$(acurl "$BASE/events/$ECHO_ID" -H "Authorization: Bearer $ECHO_TOKEN")
check "GET /events/:id (has events for echo)" 'WORKSPACE_ONLINE' "$R"
# Test 18: Update card
@@ -310,7 +295,7 @@ check "active_tasks cleared" '"active_tasks":0' "$R"
# endpoint is admin-auth gated and keeps the full record, so operators
# can still see task progress from the dashboard without exposing it
# over the public per-workspace GET.
R=$(acurl "$BASE/workspaces")
R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN")
check "current_task in list response" '"current_task"' "$R"
# Test 21: Delete
@@ -321,20 +306,18 @@ check "current_task in list response" '"current_task"' "$R"
# Delete the CHILD (Summarizer) here instead: a child delete does NOT cascade
# upward, so the parent Echo survives and count=1 holds. The bundle round-trip
# below needs Summarizer's exported config, so capture it BEFORE this delete.
# GET /bundles/export/:id is admin-gated (router.go:741) — use the admin bearer.
BUNDLE=$(acurl "$BASE/bundles/export/$SUM_ID")
BUNDLE=$(curl -s "$BASE/bundles/export/$SUM_ID" -H "Authorization: Bearer $SUM_TOKEN")
check "GET /bundles/export/:id" '"name":"Summarizer Agent"' "$BUNDLE"
ORIG_NAME=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['name'])")
ORIG_TIER=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['tier'])")
# DELETE /workspaces/:id is admin-gated (router.go:167). X-Confirm-Name must
# still match the workspace name even with admin auth.
R=$(acurl -X DELETE "$BASE/workspaces/$SUM_ID?confirm=true" \
-H "Authorization: Bearer $SUM_TOKEN" \
-H "X-Confirm-Name: Summarizer Agent")
check "DELETE /workspaces/:id" '"status":"removed"' "$R"
# Parent Echo must survive a child delete — list (admin) and expect count=1.
R=$(acurl "$BASE/workspaces")
# Parent Echo must survive a child delete — list as Echo and expect count=1.
R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN")
COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
check "List after delete (count=1)" "1" "$COUNT"
@@ -345,21 +328,21 @@ check "List after delete (count=1)" "1" "$COUNT"
echo ""
echo "--- Bundle Round-Trip Test ---"
# Delete the remaining parent Echo — DELETE is admin-gated (router.go:167);
# the platform admin bearer (acurl) authorizes it. X-Confirm-Name still required.
# Delete the remaining parent Echo — use ECHO_TOKEN (per-workspace) for
# WorkspaceAuth and ADMIN_TOKEN for the AdminAuth layer.
R=$(acurl -X DELETE "$BASE/workspaces/$ECHO_ID?confirm=true" \
-H "Authorization: Bearer $ECHO_TOKEN" \
-H "X-Confirm-Name: Echo Agent v2")
check "Delete before re-import" '"status":"removed"' "$R"
# Both workspaces are now deleted. The platform-level ADMIN_TOKEN env is still
# set, so admin routes still require the admin bearer (fail-open does NOT
# re-engage just because the token table emptied) — keep using acurl's bearer.
# After deleting both workspaces, all per-workspace tokens are revoked.
# Clear the now-revoked admin bearer so acurl can use fresh-install fail-open.
ADMIN_TOKEN=""
R=$(acurl "$BASE/workspaces")
COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
check "All workspaces deleted (count=0)" "0" "$COUNT"
# Re-import from the exported bundle. POST /bundles/import is admin-gated
# (router.go:742) — acurl sends the admin bearer.
# Re-import from the exported bundle (AdminAuth fail-open — no live tokens)
R=$(acurl -X POST "$BASE/bundles/import" -H "Content-Type: application/json" -d "$BUNDLE")
check "POST /bundles/import" '"status":"provisioning"' "$R"
NEW_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['workspace_id'])")
@@ -415,15 +398,12 @@ check "Register re-imported workspace" '"status":"registered"' "$R"
REG_NEW_TOKEN=$(echo "$R" | e2e_extract_token)
[ -n "$REG_NEW_TOKEN" ] && NEW_TOKEN="$REG_NEW_TOKEN"
# Re-export and verify agent_card survives the round-trip (#165 / PR #167 —
# GET /bundles/export/:id is admin-gated; use the admin bearer).
REBUNDLE=$(acurl "$BASE/bundles/export/$NEW_ID")
# Re-export and verify agent_card survives the round-trip (#165 / PR #167 — admin-gated)
REBUNDLE=$(curl -s "$BASE/bundles/export/$NEW_ID" -H "Authorization: Bearer $NEW_TOKEN")
check "Re-exported bundle has agent_card" '"agent_card"' "$REBUNDLE"
# Clean up — DELETE /workspaces/:id is admin-gated; pass no per-call auth so
# e2e_delete_workspace falls back to the platform admin bearer (a workspace
# bearer would be rejected by Tier-2b).
e2e_delete_workspace "$NEW_ID" "$ORIG_NAME"
# Clean up — use the token just issued to the re-imported workspace
e2e_delete_workspace "$NEW_ID" "$ORIG_NAME" -H "Authorization: Bearer $NEW_TOKEN"
echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
+45 -72
View File
@@ -1,30 +1,24 @@
#!/usr/bin/env bash
# E2E regression suite asserting that "dev mode" is fail-CLOSED.
# E2E regression suite for the local-dev escape hatches added in
# fix/quickstart-bugless. These cover the exact user-facing breakages
# that dropped out of the partial squash-merge of PR #1871:
#
# History: this file used to assert the local-dev fail-open escape hatches
# (GET /workspaces 200 with NO bearer, /workspaces/:id/activity 200 with no
# bearer) added in fix/quickstart-bugless. Under the CTO "nothing should be
# fail-open" directive (harden/no-fail-open-auth) those hatches were REMOVED:
# auth is fail-CLOSED in EVERY environment, local dev included. This suite now
# pins the inverse contract — bearer-less admin/workspace requests 401, and the
# SAME requests with the dev ADMIN_TOKEN bearer succeed.
# 1. GET /workspaces returns 200 with no bearer after tokens exist in
# the DB — exercises the AdminAuth Tier-1b dev-mode hatch
# (middleware/devmode.go::isDevModeFailOpen).
# 2. GET /workspaces/:id/activity returns 200 with no bearer — the
# same hatch applied to WorkspaceAuth.
# 3. POST /workspaces/:id/a2a doesn't 502-SSRF on a loopback workspace
# URL — exercises handlers/ssrf.go::devModeAllowsLoopback.
# 4. GET /org/templates returns the curated set populated by
# clone-manifest.sh — exercises infra/scripts/setup.sh + the
# ListTemplates failure logging in handlers/org.go.
#
# What it verifies:
# 1. GET /workspaces 401s with NO bearer once tokens exist (was: 200 via the
# removed AdminAuth Tier-1b dev-mode hatch); 200 WITH the admin bearer.
# 2. GET /workspaces/:id/activity (and /delegations, /approvals/pending) 401
# with no bearer (was: 200 via the WorkspaceAuth hatch); 200 WITH bearer.
# 3. GET /org/templates returns the curated set populated by clone-manifest.sh
# (unauth-readable bootstrap surface — unchanged).
#
# Requires: platform running on :8080 with MOLECULE_ENV=development AND
# ADMIN_TOKEN set (the dev value), with MOLECULE_ADMIN_TOKEN (or
# ADMIN_TOKEN) exported here so the suite can present the bearer.
# scripts/dev-start.sh provisions ADMIN_TOKEN locally; the e2e-api CI
# job sets it on the platform and exports the matching bearer.
# Requires: platform running on :8080 with MOLECULE_ENV=development and
# ADMIN_TOKEN unset. Matches the README quickstart env.
#
# Usage:
# MOLECULE_ADMIN_TOKEN=dev-local-admin-token bash tests/e2e/test_dev_mode.sh
# bash tests/e2e/test_dev_mode.sh
set -euo pipefail
# shellcheck source=_lib.sh
@@ -52,44 +46,35 @@ check_http() {
fi
}
echo "=== Dev-mode fail-CLOSED regression tests ==="
echo "=== Dev-mode escape-hatch regression tests ==="
echo ""
# The platform is fail-closed in every environment now, so the suite MUST have
# the admin bearer to drive the authenticated (200) assertions. Without it we
# cannot create / clean up workspaces — bail loudly rather than silently skip.
ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
if [ -z "$ADMIN_BEARER" ]; then
echo "FAIL: MOLECULE_ADMIN_TOKEN/ADMIN_TOKEN not set — auth is fail-closed in"
echo " every environment, so this suite needs the dev ADMIN_TOKEN bearer."
echo " e.g. MOLECULE_ADMIN_TOKEN=dev-local-admin-token bash $0"
exit 1
fi
ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
# Pre-test: ensure MOLECULE_ENV=development and no ADMIN_TOKEN are in the
# platform's env. The request path doesn't let us read the platform's
# env directly, but we can verify the hatch is active by confirming the
# expected behaviour under the conditions the test otherwise sets up.
e2e_cleanup_all_workspaces
# ----------------------------------------------------------------------
# Section 1 — AdminAuth is fail-CLOSED (dev-mode hatch removed)
# Section 1 — AdminAuth dev-mode hatch
# ----------------------------------------------------------------------
echo "--- Section 1: AdminAuth fail-closed ---"
# Before fix: once any workspace had tokens in the DB, GET /workspaces
# closed to unauthenticated callers and the Canvas broke. The hatch
# keeps it open specifically in dev mode.
echo "--- Section 1: AdminAuth dev-mode hatch ---"
# No bearer → 401 in dev mode (the removed hatch used to return 200).
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces")
check_http "GET /workspaces (no bearer) is fail-CLOSED" "401" "$R"
check_http "GET /workspaces (empty DB)" "200" "$R"
# With the dev admin bearer → 200.
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces" "${ADMIN_AUTH[@]}")
check_http "GET /workspaces (with admin bearer)" "200" "$R"
# Create a workspace (authenticated) so tokens land in the DB.
# Create a workspace so tokens land in the DB.
R=$(curl -s -w "\n%{http_code}" -X POST "$BASE/workspaces" \
"${ADMIN_AUTH[@]}" \
-H "Content-Type: application/json" \
-d '{"name":"Dev-Mode-Test","tier":1,"runtime":"external","external":true}')
CODE=$(echo "$R" | tail -n1)
BODY=$(echo "$R" | sed '$d')
check_http "POST /workspaces (create, with admin bearer)" "201" "$CODE"
check_http "POST /workspaces (create)" "201" "$CODE"
WS_ID=$(echo "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || true)
if [ -z "$WS_ID" ]; then
@@ -98,55 +83,43 @@ if [ -z "$WS_ID" ]; then
exit 1
fi
# Ensure a real workspace token exists so AdminAuth sees a live token globally.
# Ensure a real workspace token exists so AdminAuth now sees a live token. On
# pre-fix builds the next /workspaces call would 401 — on post-fix it
# must stay 200 because MOLECULE_ENV=development + ADMIN_TOKEN unset.
TOKEN=$(echo "$BODY" | e2e_extract_token)
if [ -z "$TOKEN" ]; then
e2e_mint_workspace_token "$WS_ID" >/dev/null
fi
# With tokens now in the DB, the bearer-less call STILL 401s (no lazy-bootstrap
# / dev-mode fall-through), and the authenticated call still 200s.
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces")
check_http "GET /workspaces (after token minted, no bearer) is fail-CLOSED" "401" "$R"
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces" "${ADMIN_AUTH[@]}")
check_http "GET /workspaces (after token minted, with admin bearer)" "200" "$R"
check_http "GET /workspaces (after token minted, no bearer)" "200" "$R"
# ----------------------------------------------------------------------
# Section 2 — WorkspaceAuth is fail-CLOSED (dev-mode hatch removed)
# Section 2 — WorkspaceAuth dev-mode hatch
# ----------------------------------------------------------------------
# Before fix: /workspaces/:id/activity 401'd once tokens existed —
# the Canvas side panel's chat history load broke.
echo ""
echo "--- Section 2: WorkspaceAuth fail-closed ---"
echo "--- Section 2: WorkspaceAuth dev-mode hatch ---"
# No bearer → 401 (the removed hatch used to return 200).
R=$(curl -s -o /dev/null -w "%{http_code}" \
"$BASE/workspaces/$WS_ID/activity?type=a2a_receive&limit=50")
check_http "GET /workspaces/:id/activity (no bearer) is fail-CLOSED" "401" "$R"
check_http "GET /workspaces/:id/activity (no bearer)" "200" "$R"
R=$(curl -s -o /dev/null -w "%{http_code}" \
"$BASE/workspaces/$WS_ID/delegations")
check_http "GET /workspaces/:id/delegations (no bearer) is fail-CLOSED" "401" "$R"
check_http "GET /workspaces/:id/delegations (no bearer)" "200" "$R"
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/approvals/pending")
check_http "GET /approvals/pending (no bearer) is fail-CLOSED" "401" "$R"
# Same requests WITH the admin bearer → 200.
R=$(curl -s -o /dev/null -w "%{http_code}" \
"$BASE/workspaces/$WS_ID/activity?type=a2a_receive&limit=50" "${ADMIN_AUTH[@]}")
check_http "GET /workspaces/:id/activity (with admin bearer)" "200" "$R"
R=$(curl -s -o /dev/null -w "%{http_code}" \
"$BASE/workspaces/$WS_ID/delegations" "${ADMIN_AUTH[@]}")
check_http "GET /workspaces/:id/delegations (with admin bearer)" "200" "$R"
R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/approvals/pending" "${ADMIN_AUTH[@]}")
check_http "GET /approvals/pending (with admin bearer)" "200" "$R"
check_http "GET /approvals/pending (no bearer)" "200" "$R"
# ----------------------------------------------------------------------
# Section 3 — Template registry populated by setup.sh
# ----------------------------------------------------------------------
# GET /org/templates is an unauthenticated bootstrap surface (the template
# palette must render before the user has a credential) — unchanged.
# Before fix: setup.sh didn't run clone-manifest.sh so the template
# palette was empty and the molecule-dev in-tree copy was broken.
echo ""
echo "--- Section 3: Template registry ---"
@@ -1,332 +0,0 @@
#!/usr/bin/env bash
set -uo pipefail
#
# test_keyless_feature_contracts_e2e.sh — REQUIRED-lane (E2E API Smoke Test)
# keyless HTTP-contract coverage for feature endpoints that ship WITHOUT an
# LLM key and had NO e2e assertion before (coverage-audit gap list).
#
# Why a NEW script (not added to test_api.sh): PR #2286 is concurrently
# rewriting test_api.sh's auth helpers + _lib.sh (e2e_admin_auth_args) and the
# test_priority_runtimes mock arm. Keeping these assertions in a standalone
# file avoids a merge conflict with that in-flight PR and keeps the new feature
# coverage independently reviewable. The mock-runtime A2A canned round-trip is
# OWNED by #2286's `mock` arm (run_mock) — intentionally NOT duplicated here.
#
# Every endpoint below is exercised against a runtime=external workspace so NO
# LLM key is needed. For each we assert the real HTTP contract: the happy path
# AND a meaningful failure mode (401 without auth, 400 on bad input, or the
# documented fail-closed status) so the test catches REAL regressions, not
# just 200s.
#
# Auth model (matches workspace-server/internal/middleware/wsauth_middleware.go):
# * WorkspaceAuth (/workspaces/:id/*) is STRICT once a token exists — a
# bearer-less request 401s (devmode fail-open needs MOLECULE_ENV=dev AND
# ADMIN_TOKEN unset, neither of which the e2e-api job sets).
# * AdminAuth routes accept the platform ADMIN_TOKEN (post-#2286) OR, when no
# ADMIN_TOKEN is configured, any valid workspace bearer (Tier-3 fallback) —
# so the workspace token we mint authenticates admin routes in BOTH the
# pre-#2286 (no ADMIN_TOKEN) and post-#2286 (ADMIN_TOKEN set) CI shapes.
#
# Local-run shape (mirrors the e2e-api job — real PG+Redis+platform):
# DATABASE_URL=... REDIS_URL=... ADMIN_TOKEN=... ./platform-server &
# BASE=http://127.0.0.1:$PORT bash tests/e2e/test_keyless_feature_contracts_e2e.sh
source "$(dirname "$0")/_lib.sh" # sets BASE default
PASS=0
FAIL=0
pass() { echo "PASS: $1"; PASS=$((PASS + 1)); }
fail() { echo "FAIL: $1"; echo " $2"; FAIL=$((FAIL + 1)); }
# assert_contains DESC EXPECTED_SUBSTRING ACTUAL
assert_contains() {
if printf '%s' "$3" | grep -qF "$2"; then
pass "$1"
else
fail "$1" "expected to contain [$2] — got: $3"
fi
}
# http_code METHOD URL [curl-args...] → prints the HTTP status code only.
http_code() {
local method="$1" url="$2"; shift 2
curl -s -o /dev/null -w "%{http_code}" -X "$method" "$url" "$@"
}
# body_and_code METHOD URL [curl-args...] → prints "<body>\n<code>".
body_and_code() {
local method="$1" url="$2"; shift 2
curl -s -w $'\n%{http_code}' -X "$method" "$url" "$@"
}
echo "=== Keyless feature HTTP-contract E2E (required lane) ==="
echo ""
# Platform admin bearer when the job set one (#2286 shape). When ADMIN_TOKEN is
# configured, AdminAuth's Tier-1 fail-open is OFF even before the first token
# exists, so admin-gated create / list / delete must carry it from the start.
# Pre-#2286 (no ADMIN_TOKEN) this is empty → fail-open create works bare.
ENV_ADMIN="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
ENV_ADMIN_AUTH=()
[ -n "$ENV_ADMIN" ] && ENV_ADMIN_AUTH=(-H "Authorization: Bearer $ENV_ADMIN")
# Reproducible counts across reruns. e2e_cleanup_all_workspaces hits the
# admin-gated list/delete; the platform admin bearer (if set) goes via the
# MOLECULE_ADMIN_TOKEN/ADMIN_TOKEN env the helper already reads.
e2e_cleanup_all_workspaces
# ---------------------------------------------------------------------------
# Fixture: one external workspace, registered → online. Keyless (external=true
# means no container is provisioned and no LLM key is consulted).
# ---------------------------------------------------------------------------
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
${ENV_ADMIN_AUTH[@]+"${ENV_ADMIN_AUTH[@]}"} \
-d '{"name":"Keyless Fixture","tier":1,"runtime":"external","external":true}')
WS_ID=$(printf '%s' "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
if [ -z "$WS_ID" ]; then
echo "FATAL: could not create fixture workspace — got: $R" >&2
exit 2
fi
assert_contains "POST /workspaces (external fixture created)" '"status":"awaiting_agent"' "$R"
# Workspace token: register returns one; else mint via the admin endpoint.
WS_TOKEN=$(printf '%s' "$R" | e2e_extract_token)
if [ -z "$WS_TOKEN" ]; then
WS_TOKEN=$(e2e_mint_workspace_token "$WS_ID" 2>/dev/null || echo "")
fi
if [ -z "$WS_TOKEN" ]; then
echo "FATAL: could not obtain workspace token for $WS_ID" >&2
exit 2
fi
AUTH=(-H "Authorization: Bearer $WS_TOKEN")
# Admin bearer: explicit platform ADMIN_TOKEN if the job set one (#2286 shape),
# else the workspace token (AdminAuth Tier-3 accepts it pre-#2286).
ADMIN_BEARER="${ENV_ADMIN:-$WS_TOKEN}"
ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
# Bring the fixture online so lifecycle (hibernate) has a hibernatable state.
curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
-d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
# ===========================================================================
# 1. Terminal diagnose — GET /workspaces/:id/terminal/diagnose (wsAuth)
# External workspace has no instance_id → diagnoseLocal path → 200 with a
# deterministic report (ok=false, first_failure on docker/container). The
# /terminal endpoint itself is a WebSocket upgrade (not HTTP-assertable
# keyless); diagnose is its pure-HTTP sibling and the real contract surface.
# ===========================================================================
echo "--- /terminal/diagnose ---"
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose" "${AUTH[@]}")
DIAG_CODE=$(printf '%s' "$BC" | tail -n1)
DIAG_BODY=$(printf '%s' "$BC" | sed '$d')
assert_contains "GET /terminal/diagnose (200 report)" "200" "$DIAG_CODE"
assert_contains "GET /terminal/diagnose (carries workspace_id)" "\"workspace_id\":\"$WS_ID\"" "$DIAG_BODY"
assert_contains "GET /terminal/diagnose (has steps[])" '"steps"' "$DIAG_BODY"
# Failure mode: no bearer → 401 (WorkspaceAuth strict once a token exists).
assert_contains "GET /terminal/diagnose (no auth → 401)" "401" \
"$(http_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose")"
# ===========================================================================
# 2. Webhooks (public) — POST /webhooks/:type
# Public, no auth. telegram adapter: empty update body → (nil,nil) → 200
# ignored; non-JSON → parse error → 400; unknown type → 404.
# ===========================================================================
echo "--- /webhooks/:type ---"
BC=$(body_and_code POST "$BASE/webhooks/telegram" -H "Content-Type: application/json" -d '{}')
WH_CODE=$(printf '%s' "$BC" | tail -n1)
WH_BODY=$(printf '%s' "$BC" | sed '$d')
assert_contains "POST /webhooks/telegram (non-message update → 200)" "200" "$WH_CODE"
assert_contains "POST /webhooks/telegram (status ignored)" '"status":"ignored"' "$WH_BODY"
assert_contains "POST /webhooks/telegram (bad JSON → 400)" "400" \
"$(http_code POST "$BASE/webhooks/telegram" -H 'Content-Type: application/json' -d 'not-json')"
assert_contains "POST /webhooks/<unknown> (→ 404)" "404" \
"$(http_code POST "$BASE/webhooks/nope-not-a-channel" -H 'Content-Type: application/json' -d '{}')"
# ===========================================================================
# 3. Budget — GET /workspaces/:id/budget (wsAuth) + PATCH (admin)
# GET: fresh workspace → multi-period view, no limits, zero spend.
# PATCH: set monthly limit (admin) → reflected; bad input → 400.
# ===========================================================================
echo "--- /budget ---"
BUD=$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")
assert_contains "GET /budget (has periods map)" '"periods"' "$BUD"
assert_contains "GET /budget (monthly_spend 0 on fresh ws)" '"monthly_spend":0' "$BUD"
# PATCH is admin-gated (router.go:419). Set a monthly limit and verify echo.
PB=$(curl -s -X PATCH "$BASE/workspaces/$WS_ID/budget" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
-d '{"budget_limits":{"monthly":2000}}')
assert_contains "PATCH /budget (monthly limit set → echoed)" '"budget_limit":2000' "$PB"
# Re-read confirms persistence.
assert_contains "GET /budget (limit persisted)" '"budget_limit":2000' \
"$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")"
# Failure: empty body → 400 "budget_limits or budget_limit field is required".
assert_contains "PATCH /budget (empty body → 400)" "400" \
"$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
# Failure: unknown period → 400.
assert_contains "PATCH /budget (unknown period → 400)" "400" \
"$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"budget_limits":{"yearly":1}}')"
# Failure: GET without bearer → 401.
assert_contains "GET /budget (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/budget")"
# ===========================================================================
# 4. Checkpoints — POST/GET/DELETE /workspaces/:id/checkpoints* (wsAuth)
# Fully self-contained CRUD over workflow_checkpoints (#788). Upsert → latest
# → list-by-wfid → delete → 404. Failure modes: missing workflow_id → 400,
# empty latest → 404.
# ===========================================================================
echo "--- /checkpoints ---"
WFID="kl-wf-$$"
CP=$(curl -s -X POST "$BASE/workspaces/$WS_ID/checkpoints" -H "Content-Type: application/json" "${AUTH[@]}" \
-d "{\"workflow_id\":\"$WFID\",\"step_name\":\"step-a\",\"step_index\":1,\"payload\":{\"k\":\"v\"}}")
assert_contains "POST /checkpoints (upsert → id + workflow_id)" "\"workflow_id\":\"$WFID\"" "$CP"
assert_contains "GET /checkpoints/latest (200 newest)" "\"workflow_id\":\"$WFID\"" \
"$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/latest" "${AUTH[@]}")"
assert_contains "GET /checkpoints/:wfid (lists the step)" '"step_name":"step-a"' \
"$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
DEL=$(curl -s -X DELETE "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")
assert_contains "DELETE /checkpoints/:wfid (deleted count)" '"deleted":1' "$DEL"
assert_contains "GET /checkpoints/:wfid (after delete → 404)" "404" \
"$(http_code GET "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
# Failure: missing workflow_id → 400 (binding:required).
assert_contains "POST /checkpoints (missing workflow_id → 400)" "400" \
"$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' "${AUTH[@]}" -d '{"step_name":"x"}')"
# Failure: no bearer → 401.
assert_contains "POST /checkpoints (no auth → 401)" "401" \
"$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' -d '{"workflow_id":"x","step_name":"y"}')"
# ===========================================================================
# 5. Audit — GET /workspaces/:id/audit (wsAuth)
# EU AI Act ledger query (#594). Fresh ws → empty events, total 0,
# chain_valid null (AUDIT_LEDGER_SALT unset). Failure: bad RFC3339 from → 400.
# ===========================================================================
echo "--- /audit ---"
AUD=$(curl -s "$BASE/workspaces/$WS_ID/audit" "${AUTH[@]}")
assert_contains "GET /audit (total 0 on fresh ws)" '"total":0' "$AUD"
assert_contains "GET /audit (chain_valid null without salt)" '"chain_valid":null' "$AUD"
assert_contains "GET /audit (bad 'from' → 400)" "400" \
"$(http_code GET "$BASE/workspaces/$WS_ID/audit?from=not-a-date" "${AUTH[@]}")"
assert_contains "GET /audit (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/audit")"
# ===========================================================================
# 6. Traces — GET /workspaces/:id/traces (wsAuth)
# Langfuse proxy (#590). No LANGFUSE_* configured → 200 [] (graceful empty),
# never a 5xx. Failure: no auth → 401.
# ===========================================================================
echo "--- /traces ---"
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/traces" "${AUTH[@]}")
TR_CODE=$(printf '%s' "$BC" | tail -n1)
TR_BODY=$(printf '%s' "$BC" | sed '$d')
assert_contains "GET /traces (200 without Langfuse)" "200" "$TR_CODE"
assert_contains "GET /traces (empty list)" '[]' "$TR_BODY"
assert_contains "GET /traces (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/traces")"
# ===========================================================================
# 7. Session search — GET /workspaces/:id/session-search (wsAuth)
# Searches activity_logs. Seed one activity row, then assert q-filter finds
# it and a non-matching q returns []. Failure: no auth → 401.
# ===========================================================================
echo "--- /session-search ---"
curl -s -X POST "$BASE/workspaces/$WS_ID/activity" -H "Content-Type: application/json" "${AUTH[@]}" \
-d '{"activity_type":"agent_log","method":"inference","summary":"keyless-needle marker"}' >/dev/null
assert_contains "GET /session-search?q=keyless-needle (finds row)" 'keyless-needle' \
"$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=keyless-needle" "${AUTH[@]}")"
assert_contains "GET /session-search?q=<no-match> (empty)" '[]' \
"$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=zzz-no-such-token-zzz" "${AUTH[@]}")"
assert_contains "GET /session-search (no auth → 401)" "401" \
"$(http_code GET "$BASE/workspaces/$WS_ID/session-search?q=x")"
# ===========================================================================
# 8. Rescue — GET /workspaces/:id/rescue (wsAuth)
# RFC internal#742. Fail-CLOSED contract: the e2e-api job has no
# MOLECULE_ORG_ID, so the handler returns 503 platform_misconfigured rather
# than leaking cross-org. That fail-closed behaviour IS the keyless contract
# we gate here (a regression that drops the org guard would flip this to a
# 200/404 and turn this assertion RED). Failure mode: no auth → 401.
# ===========================================================================
echo "--- /rescue ---"
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/rescue" "${AUTH[@]}")
RES_CODE=$(printf '%s' "$BC" | tail -n1)
RES_BODY=$(printf '%s' "$BC" | sed '$d')
if [ "$RES_CODE" = "404" ]; then
# MOLECULE_ORG_ID was set in this environment → no-bundle path.
assert_contains "GET /rescue (no bundle → 404, org configured)" 'no rescue bundle' "$RES_BODY"
else
# No MOLECULE_ORG_ID (the e2e-api default) → fail-closed 503.
assert_contains "GET /rescue (fail-closed 503 without MOLECULE_ORG_ID)" "503" "$RES_CODE"
assert_contains "GET /rescue (platform_misconfigured code)" 'platform_misconfigured' "$RES_BODY"
fi
assert_contains "GET /rescue (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/rescue")"
# ===========================================================================
# 9. LLM billing-mode admin toggle — GET/PUT /admin/workspaces/:id/llm-billing-mode
# (AdminAuth). Flip to byok → read back override; bad UUID → 400; missing
# 'mode' key → 400; unknown mode → 400.
# ===========================================================================
echo "--- /admin/workspaces/:id/llm-billing-mode ---"
assert_contains "GET llm-billing-mode (resolves a mode)" '"resolved_mode"' \
"$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
PUTBM=$(curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
-d '{"mode":"byok"}')
assert_contains "PUT llm-billing-mode byok (override set)" '"workspace_override":"byok"' "$PUTBM"
assert_contains "GET llm-billing-mode (byok persisted)" '"workspace_override":"byok"' \
"$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
# Clear the override (null) so we don't leave fixture state skewed.
curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
-d '{"mode":null}' >/dev/null
# Failure: malformed UUID → 400.
assert_contains "PUT llm-billing-mode (bad UUID → 400)" "400" \
"$(http_code PUT "$BASE/admin/workspaces/not-a-uuid/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"byok"}')"
# Failure: missing 'mode' key → 400.
assert_contains "PUT llm-billing-mode (missing mode → 400)" "400" \
"$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
# Failure: unknown mode string → 400.
assert_contains "PUT llm-billing-mode (unknown mode → 400)" "400" \
"$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"bogus-mode"}')"
# ===========================================================================
# 10. Lifecycle — Pause → Resume + Hibernate (wsAuth)
# Pause works backend-agnostically (StopWorkspaceAuto no-ops on no backend)
# → status=paused. Resume re-provisions: 200 provisioning when a provisioner
# is wired (the e2e-api host has Docker), or 503 provisioner-not-available
# otherwise — both are valid contracts, so accept either. Failure modes:
# resume a non-paused ws → 404; hibernate a non-online ws → 404.
# ===========================================================================
echo "--- lifecycle (resume / hibernate) ---"
# Pause the (online) fixture → status paused.
PA=$(curl -s -X POST "$BASE/workspaces/$WS_ID/pause" "${AUTH[@]}")
assert_contains "POST /pause (online → paused)" '"status":"paused"' "$PA"
# Resume the paused fixture — accept 200 provisioning OR 503 (no provisioner).
BC=$(body_and_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")
RSM_CODE=$(printf '%s' "$BC" | tail -n1)
RSM_BODY=$(printf '%s' "$BC" | sed '$d')
if [ "$RSM_CODE" = "200" ]; then
assert_contains "POST /resume (paused → provisioning)" '"status":"provisioning"' "$RSM_BODY"
elif [ "$RSM_CODE" = "503" ]; then
assert_contains "POST /resume (no provisioner → 503 contract)" 'provisioner not available' "$RSM_BODY"
else
fail "POST /resume (expected 200 or 503)" "got HTTP $RSM_CODE$RSM_BODY"
fi
# Failure: resume a workspace that is NOT paused → 404.
# (After the resume above it is provisioning/online, not paused.)
assert_contains "POST /resume (not-paused → 404)" "404" \
"$(http_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")"
# Hibernate: bring the fixture back online first, then hibernate it.
curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
-d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
HB=$(curl -s -X POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")
assert_contains "POST /hibernate (online → hibernated)" '"status":"hibernated"' "$HB"
# Failure: hibernate again (now hibernated, not online/degraded) → 404.
assert_contains "POST /hibernate (not-hibernatable → 404)" "404" \
"$(http_code POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")"
# Failure: no bearer → 401.
assert_contains "POST /resume (no auth → 401)" "401" "$(http_code POST "$BASE/workspaces/$WS_ID/resume")"
# ---------------------------------------------------------------------------
# Cleanup — delete the fixture (admin-gated DELETE + per-workspace bearer).
# ---------------------------------------------------------------------------
e2e_delete_workspace "$WS_ID" "Keyless Fixture" "${ADMIN_AUTH[@]}"
echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
[ "$FAIL" -eq 0 ]
+2 -2
View File
@@ -49,13 +49,13 @@ run_test "codex → slash-form fallback" codex
run_test "claude-code → OAuth/default alias" claude-code "sonnet"
got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code)
assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "minimax:MiniMax-M2.7"
assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "MiniMax-M2"
got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code)
assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6"
got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code)
assert_eq "claude-code + both keys → MiniMax priority" "$got" "minimax:MiniMax-M2.7"
assert_eq "claude-code + both keys → MiniMax priority" "$got" "MiniMax-M2"
# ── Fallback for unknown runtime ──
# Picks slash-form (hermes-shaped) since hermes is the historical
+2 -9
View File
@@ -28,13 +28,6 @@ PASS=0
FAIL=0
WSID=""
# GET /workspaces (list) and POST /workspaces (create) are AdminAuth-gated
# (router.go:165-166). The e2e-api CI job sets ADMIN_TOKEN on the platform
# (fail-open OFF) and exports MOLECULE_ADMIN_TOKEN here, so these calls need the
# admin bearer. Guarded if-set so a fail-open dev platform still works.
ADMIN_AUTH=()
e2e_admin_auth_args ADMIN_AUTH
cleanup() {
# Workspace teardown — best-effort, ignore errors so an unrelated CP
# outage doesn't shadow a real test failure.
@@ -87,7 +80,7 @@ echo "=== Setup ==="
# canvas. Find and delete any with this exact name so the test is safe to
# re-run from any state. Match by name (not tag) so this also catches
# leftovers created by older script versions.
PRIOR=$(curl -s "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} | python3 -c '
PRIOR=$(curl -s "$BASE/workspaces" | python3 -c '
import json, sys
try:
print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name") == "Notify E2E"))
@@ -103,7 +96,7 @@ done
# feedback_workspace_model_required_no_platform_default_dynamic_credential_intake).
# Body has no runtime → defaults to claude-code; pass the matching model
# that the workspace-creation contract now requires.
R=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
-d '{"name":"Notify E2E","tier":1,"runtime":"external","external":true,"model":"sonnet"}')
WSID=$(echo "$R" | python3 -c 'import json,sys;print(json.load(sys.stdin)["id"])' 2>/dev/null || true)
[ -n "$WSID" ] || { echo "Failed to create workspace: $R"; exit 1; }
+1 -31
View File
@@ -300,14 +300,7 @@ rows = json.load(sys.stdin)
def text_of(r):
body = r.get('request_body') or {}
parts = (body.get('params') or {}).get('message', {}).get('parts') or []
# A2A v0.3 keys the Part discriminator on 'kind'; legacy senders used
# 'type'. ProxyA2A.normalizeA2APayload (#2251) rewrites 'type' -> 'kind'
# on ingest, so the stored request_body carries 'kind' even when the
# caller posted 'type'. Accept EITHER so this parser asserts on the text
# payload, not on which discriminator field the server happened to store.
def is_text(p):
return p.get('kind') == 'text' or p.get('type') == 'text'
return ''.join(p.get('text', '') for p in parts if is_text(p))
return ''.join(p.get('text','') for p in parts if p.get('type')=='text')
if len(rows) < 2:
print('NEED2_GOT_'+str(len(rows)))
else:
@@ -316,29 +309,6 @@ else:
check_eq "since_id feed orders ASC (oldest-new first, newest-new last)" \
"hello-from-e2e-2|hello-from-e2e-3" "$ASC_FIRST"
# Wire-contract gate (#2251): the caller posted parts with the LEGACY "type"
# discriminator, but ProxyA2A.normalizeA2APayload rewrites "type" -> "kind"
# (A2A v0.3) BEFORE the row is durably logged. Assert the stored request_body
# carries "kind" and no longer carries "type", so a regression that drops the
# rename — or a feed that stops storing the normalized body — fails loudly here
# instead of silently feeding the polling agent an untagged Part. This is the
# end-to-end half of the Go unit tests in a2a_proxy_test.go (which assert the
# rename in isolation); this proves it survives the durable activity_logs path.
DISC=$(echo "$ASC_RESP" | python3 -c "
import json, sys
rows = json.load(sys.stdin)
kinds, types = [], []
for r in rows:
body = r.get('request_body') or {}
parts = (body.get('params') or {}).get('message', {}).get('parts') or []
for p in parts:
if 'kind' in p: kinds.append(p['kind'])
if 'type' in p: types.append(p['type'])
print(('kind' if kinds and not types else 'BAD') + ':' + ','.join(kinds) + '/' + ','.join(types))
")
check_eq "stored Part uses v0.3 'kind' discriminator, never legacy 'type' (#2251)" \
"kind:text,text/" "$DISC"
# ---------- Phase 6: stale cursor returns 410 ----------
echo ""
echo "--- Phase 6: Stale / unknown cursor returns 410 ---"
+19 -396
View File
@@ -24,73 +24,11 @@
# Each phase skips cleanly when its prerequisite secret is absent so a
# partially-keyed env (e.g. CI without an OpenAI key) doesn't false-fail.
#
# REQUIRE-LIVE (false-green guard, mirrors CP serving-e2e's
# SERVING_E2E_REQUIRE_LIVE semantics)
# ------------------------------------------------------------------
# Without a guard, an env with NO live secrets makes every phase SKIP,
# leaving PASS=0 FAIL=0 — and the historical `[ "$FAIL" -eq 0 ]` gate
# exits 0 (GREEN) while validating ZERO runtimes. That made the REQUIRED
# `E2E API Smoke Test` merge gate pass without exercising a single
# runtime (false-green).
#
# Fix: a real "validated arm" counter (VALIDATED) tracks runtimes that
# actually ran AND produced a non-error A2A reply. With E2E_REQUIRE_LIVE=1:
# if zero arms validated, the run exits NON-zero with a loud message.
# Without it (E2E_REQUIRE_LIVE unset/0), a fully-skipped run stays a LOUD
# skip + exit 0 for dev convenience.
#
# This zero-validated→RED decision is the load-bearing logic. It is factored
# into evaluate_require_live_gate() (a pure function of $FAIL/$VALIDATED/
# $E2E_REQUIRE_LIVE, defined before any platform I/O) and is REGRESSION-GATED
# on every PR by tests/e2e/test_require_live_priority_gate_unit.sh, which
# sources this file (E2E_PRIORITY_UNIT_SOURCE=1), sets the counters, and
# asserts the gate's exit code — no platform, no provisioning, no network.
# So the false-green can't silently come back: a revert of the guard fails CI.
#
# CI POSTURE (REQUIRE-LIVE ON — see .gitea/workflows/e2e-api.yml):
# The live e2e-api job SETS E2E_REQUIRE_LIVE=1. The `mock` arm is the
# CI-provisionable live-completion arm: it org-imports a mock workspace
# (→online→canned A2A reply) with NO external secret. The only thing that
# previously blocked it in CI was admin auth — POST /org/import and POST
# /admin/workspaces/:id/tokens are AdminAuth-gated, and the job set no admin
# token, so every admin call 401'd ("admin auth required"). The job now sets
# ADMIN_TOKEN on the platform AND exports the matching MOLECULE_ADMIN_TOKEN
# the scripts send, so mock validates end-to-end and VALIDATED>=1 holds on a
# healthy platform — the REQUIRED `E2E API Smoke Test` gate now HONESTLY
# validates a runtime. If the mock plumbing or the admin-auth wiring breaks,
# the gate goes RED (not false-green). The zero-validated→RED decision is also
# regression-gated WITHOUT provisioning by the bash unit test above, so a
# revert of that logic still fails CI.
#
# LIVE ARMS (run when their prerequisite is present; opportunistic):
# - `mock` (run_mock) is the no-key REQUIRE-LIVE backbone: a virtual
# workspace (no container, no EC2, no provider) whose org-import path
# short-circuits to status='online' with a canned A2A reply. It validates
# in CI now that the e2e-api job wires an admin token (org-import + token
# mint are AdminAuth-gated), so it is the guaranteed >=1 validation.
# - MiniMax (E2E_MINIMAX_API_KEY, from MOLECULE_STAGING_MINIMAX_API_KEY) is
# an OPPORTUNISTIC best-effort real-LLM arm: registry-fragile in CI (422
# UNREGISTERED_MODEL_FOR_RUNTIME — see run_minimax header), so a miss is
# a best-effort MISS via bestfail() and does NOT red the gate.
# The CI e2e-api job sets E2E_REQUIRE_LIVE=1: mock guarantees a validation, so
# the REQUIRED gate is honest (RED if the mock plumbing/admin-auth breaks). The
# zero-validated→RED logic is also regression-gated by the bash unit test above.
#
# Usage:
# # Enforce REQUIRE-LIVE locally (need >=1 arm to actually validate):
# E2E_REQUIRE_LIVE=1 E2E_MINIMAX_API_KEY=... \
# tests/e2e/test_priority_runtimes_e2e.sh
#
# # Default (no enforcement): all-skip stays a LOUD skip + exit 0:
# tests/e2e/test_priority_runtimes_e2e.sh
#
# # Other live arms (if their secrets are configured):
# CLAUDE_CODE_OAUTH_TOKEN=... E2E_OPENAI_API_KEY=... \
# tests/e2e/test_priority_runtimes_e2e.sh
#
# # Run only one runtime
# E2E_RUNTIMES=mock tests/e2e/test_priority_runtimes_e2e.sh
# E2E_RUNTIMES=minimax tests/e2e/test_priority_runtimes_e2e.sh
# E2E_RUNTIMES=claude-code tests/e2e/test_priority_runtimes_e2e.sh
# E2E_RUNTIMES=hermes tests/e2e/test_priority_runtimes_e2e.sh
#
@@ -103,81 +41,13 @@
set -euo pipefail
source "$(dirname "$0")/_lib.sh"
PASS=0
FAIL=0
SKIP=0
# VALIDATED counts runtimes that ACTUALLY ran end-to-end (provisioned,
# reached online, AND returned a non-error A2A reply). Distinct from PASS,
# which also counts sub-assertions like activity-log rows. This is the
# signal the REQUIRE-LIVE gate keys off: VALIDATED==0 means we proved
# nothing about any runtime, regardless of how many sub-asserts "passed".
VALIDATED=0
CREATED_WSIDS=()
# evaluate_require_live_gate — the SINGLE source of the final exit decision.
# Pure function of $FAIL, $VALIDATED, and $E2E_REQUIRE_LIVE; performs NO I/O
# beyond the loud messages. Returns the exit code the script should exit with:
# - FAIL>0 → 1 (a real failure is always red)
# - VALIDATED==0 + REQUIRE_LIVE → 1 (false-green trap: proved nothing → RED)
# - VALIDATED==0 + !REQUIRE_LIVE → 0 (dev-convenience LOUD skip)
# - VALIDATED>=1 → 0 (at least one arm validated end-to-end)
# It is a function (not inline tail code) so test_require_live_priority_gate_unit.sh
# can drive the REAL decision in isolation — set the counters, call this, assert
# the return code — with no platform, no provisioning, no network. That makes the
# zero-validated→RED logic a CI-gated regression contract: a future revert of it
# fails the unit test on every PR. See that unit test for the fail-direction proof.
evaluate_require_live_gate() {
# Any real failure is always red.
if [ "$FAIL" -ne 0 ]; then
return 1
fi
# REQUIRE-LIVE gate (mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE).
# A run where every runtime SKIPPED proves nothing. In enforced mode
# (E2E_REQUIRE_LIVE=1) that MUST be red so the required `E2E API Smoke
# Test` gate can't be false-green on an all-skip run.
local require_live="${E2E_REQUIRE_LIVE:-0}"
if [ "$VALIDATED" -eq 0 ]; then
if [ "$require_live" = "1" ] || [ "$require_live" = "true" ]; then
echo "::error::E2E_REQUIRE_LIVE is set but ZERO runtimes were validated end-to-end." >&2
echo " Every runtime SKIPPED — no live secret was present, so this gate" >&2
echo " validated nothing. Wire at least one live arm via Gitea secrets" >&2
echo " (E2E_MINIMAX_API_KEY ← MOLECULE_STAGING_MINIMAX_API_KEY is the" >&2
echo " default CI arm; CLAUDE_CODE_OAUTH_TOKEN / E2E_OPENAI_API_KEY also" >&2
echo " work) so >=1 runtime actually provisions + replies. Failing RED" >&2
echo " instead of false-green." >&2
return 1
fi
# Dev convenience: no enforcement requested → loud skip, exit 0.
echo "SKIPPED: no live secrets present and E2E_REQUIRE_LIVE is not set — validated" >&2
echo " zero runtimes. This is a dev-convenience pass; CI sets" >&2
echo " E2E_REQUIRE_LIVE=1 to make zero-validated a hard failure." >&2
return 0
fi
echo "OK: $VALIDATED runtime(s) validated end-to-end."
return 0
}
# Source-guard: when sourced by the unit test (E2E_PRIORITY_UNIT_SOURCE=1) we
# stop HERE — the counters + evaluate_require_live_gate are now defined, and we
# must NOT fall through to _lib.sh's platform-dependent helpers or the live
# pre-sweep curl below (there is no platform in the unit-test environment).
if [ "${E2E_PRIORITY_UNIT_SOURCE:-0}" = "1" ]; then
return 0
fi
source "$(dirname "$0")/_lib.sh"
# GET /workspaces (list, router.go:165) and POST /workspaces (create,
# router.go:166) are AdminAuth-gated. The e2e-api CI job sets ADMIN_TOKEN on the
# platform (fail-open OFF) and exports MOLECULE_ADMIN_TOKEN here, so the
# pre-sweep list and every runtime-create must send the admin bearer or they
# 401. run_mock uses POST /org/import (also admin-gated) and wires its own admin
# auth inline. Guarded if-set so a fail-open dev platform still works.
ADMIN_AUTH=()
e2e_admin_auth_args ADMIN_AUTH
cleanup() {
# `set -u` + empty array would error on "${CREATED_WSIDS[@]}"; the
# ${VAR[@]+"…"} form expands to nothing when the array is unset/empty
@@ -188,26 +58,14 @@ cleanup() {
}
trap cleanup EXIT
pass() { echo " PASS — $1"; PASS=$((PASS + 1)); }
fail() { echo " FAIL — $1"; echo " $2"; FAIL=$((FAIL + 1)); }
skip() { echo " SKIP — $1"; SKIP=$((SKIP + 1)); }
# Mark a runtime as having been validated end-to-end (online + non-error
# A2A reply). Also emits a PASS line so it shows in the results tally.
validated() { echo " PASS — $1"; PASS=$((PASS + 1)); VALIDATED=$((VALIDATED + 1)); }
# bestfail() is for OPPORTUNISTIC (best-effort) arms whose failure must
# NOT red the gate. It does NOT increment FAIL — it only logs + bumps
# SKIP so the tally stays honest ("we tried, it didn't validate, but it
# was never load-bearing"). Used by the MiniMax arm: MiniMax-create is
# fragile in CI (registry-skewed model id, BYOK plumbing — see core#2263
# and the run_minimax header), so a MiniMax miss is reported but never
# fails the REQUIRED gate. The mock arm is the load-bearing validation
# that keeps the gate honest; MiniMax is the real-LLM bonus on top.
bestfail() { echo " BEST-EFFORT MISS — $1"; echo " $2"; SKIP=$((SKIP + 1)); }
pass() { echo " PASS — $1"; PASS=$((PASS + 1)); }
fail() { echo " FAIL — $1"; echo " $2"; FAIL=$((FAIL + 1)); }
skip() { echo " SKIP — $1"; SKIP=$((SKIP + 1)); }
# Pre-sweep any prior runs that left workspaces behind (same defence as
# test_notify_attachments_e2e.sh: trap fires on normal exit, but a
# SIGPIPE / kill -9 can bypass it).
PRIOR=$(curl -s "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} | python3 -c '
PRIOR=$(curl -s "$BASE/workspaces" | python3 -c '
import json, sys
try:
print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name","").startswith("Priority E2E ")))
@@ -330,7 +188,7 @@ print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN
")
local resp wsid
# model required (CTO 2026-05-22 SSOT) — pass the deleted DefaultModel("claude-code") value.
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
-d "{\"name\":\"Priority E2E (claude-code)\",\"runtime\":\"claude-code\",\"model\":\"sonnet\",\"tier\":1,\"secrets\":$secrets}")
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
if [ -z "$wsid" ]; then
@@ -362,9 +220,9 @@ print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
if echo "$reply" | grep -q "PONG"; then
validated "claude-code reply contains PONG"
pass "claude-code reply contains PONG"
else
validated "claude-code reply non-empty (first 80 chars: ${reply:0:80})"
pass "claude-code reply non-empty (first 80 chars: ${reply:0:80})"
fi
assert_activity_logged "claude-code" "$wsid" "$token"
else
@@ -396,7 +254,7 @@ print(json.dumps({
}))
")
local resp wsid
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
-d "{\"name\":\"Priority E2E (hermes)\",\"runtime\":\"hermes\",\"tier\":1,\"model\":\"openai/gpt-4o\",\"secrets\":$secrets}")
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
if [ -z "$wsid" ]; then
@@ -430,9 +288,9 @@ print(json.dumps({
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
if echo "$reply" | grep -q "PONG"; then
validated "hermes reply contains PONG"
pass "hermes reply contains PONG"
else
validated "hermes reply non-empty (first 80 chars: ${reply:0:80})"
pass "hermes reply non-empty (first 80 chars: ${reply:0:80})"
fi
assert_activity_logged "hermes" "$wsid" "$token"
else
@@ -469,7 +327,7 @@ print(json.dumps({
}))
")
local resp wsid
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
-d "{\"name\":\"Priority E2E ($runtime)\",\"runtime\":\"$runtime\",\"tier\":1,\"model\":\"openai/gpt-4o-mini\",\"secrets\":$secrets}")
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
if [ -z "$wsid" ]; then
@@ -500,9 +358,9 @@ print(json.dumps({
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
if echo "$reply" | grep -q "PONG"; then
validated "$runtime reply contains PONG"
pass "$runtime reply contains PONG"
else
validated "$runtime reply non-empty (first 80 chars: ${reply:0:80})"
pass "$runtime reply non-empty (first 80 chars: ${reply:0:80})"
fi
assert_activity_logged "$runtime" "$wsid" "$token"
else
@@ -513,253 +371,18 @@ print(json.dumps({
run_codex() { run_openai_runtime "codex" "codex"; }
run_openclaw() { run_openai_runtime "openclaw" "openclaw"; }
####################################################################
# Mock arm — the GUARANTEED, always-available REQUIRE-LIVE backbone.
####################################################################
# The mock runtime (workspace-server/internal/handlers/mock_runtime.go)
# is a virtual workspace: NO container, NO EC2, NO LLM key. The org-import
# path (createWorkspaceTree, org_import.go) short-circuits a runtime=mock
# workspace straight to status='online' (no provisioner needed), and the
# A2A proxy (a2a_proxy.go → handleMockA2A) synthesises a deterministic
# canned JSON-RPC reply with logActivity=true (writes the activity_logs
# row too). That makes mock the perfect REQUIRE-LIVE backbone: it
# exercises the SAME plumbing every real runtime needs to pass —
# provision-decision → status=online → A2A round-trip → activity_logs —
# without depending on any external provider key or LLM availability. It
# is GREEN on a healthy platform and RED only if that plumbing genuinely
# breaks (DB insert, status flip, A2A proxy, activity logging). No more
# false-green (zero-validated is impossible when mock works), and no more
# can't-go-green (mock needs no secret, so it always runs in CI).
#
# Why org-import (POST /org/import) instead of POST /workspaces:
# The mock→online short-circuit lives ONLY in createWorkspaceTree
# (org_import.go). The single-workspace Create handler (workspace.go)
# has no mock branch — it routes runtime=mock through
# provisionWorkspaceAuto, which in CI's local-build mode has no mock
# image and would never reach online. Org-import is the supported path
# to a live mock workspace, so the arm drives it.
#
# The canned reply is one of the "On it!" variants (NOT "PONG"), so this
# arm validates on the non-empty / non-error branch — that is the real
# contract for mock (it proves the plumbing, not an LLM's instruction-
# following).
run_mock() {
echo ""
echo "=== mock (no-key plumbing backbone) happy path ==="
# No secret gate — mock ALWAYS runs. That is the whole point: it is the
# required-validation arm that keeps E2E_REQUIRE_LIVE honest without a key.
# Inline single-workspace mock org. model is a required field on the
# org-import contract (createWorkspaceTree fails-closed without one);
# mock never USES the model, so any non-empty value satisfies the
# contract. The org-import path does not run the Create handler's
# registry model-validation, so "mock" is accepted as-is.
# POST /org/import is AdminAuth-gated (router.go:778). When the platform has
# ADMIN_TOKEN set (as the e2e-api CI job now does), an unauthenticated import
# 401s with {"error":"admin auth required"}. Send the same admin bearer the
# mint helper uses (MOLECULE_ADMIN_TOKEN, ADMIN_TOKEN fallback) — guarded so a
# bootstrap/dev platform with no admin token (fail-open) still works.
local admin_bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
local admin_auth=()
[ -n "$admin_bearer" ] && admin_auth=(-H "Authorization: Bearer $admin_bearer")
local import_resp wsid
import_resp=$(curl -s -X POST "$BASE/org/import" -H "Content-Type: application/json" \
${admin_auth[@]+"${admin_auth[@]}"} \
-d '{
"template": {
"name": "Priority E2E Mock Org",
"defaults": {"runtime": "mock", "model": "mock", "tier": 1},
"workspaces": [
{"name": "Priority E2E (mock)", "runtime": "mock", "model": "mock", "tier": 1}
]
}
}')
# org-import returns {"org":..., "count":N, "workspaces":[{"id":...,
# "name":...,"tier":...}, ...]} (handlers/org.go:898-901). Pull the id of
# the single workspace we declared. (Older "results" key fallback kept for
# forward/back compat in case the response shape is ever versioned.)
wsid=$(echo "$import_resp" | python3 -c '
import json, sys
try:
d = json.load(sys.stdin)
except Exception:
sys.exit(0)
for r in (d.get("workspaces") or d.get("results") or []):
if r.get("name") == "Priority E2E (mock)" and r.get("id"):
print(r["id"]); break
') || true
if [ -z "$wsid" ]; then
# mock org-import is the REQUIRE-LIVE backbone and is EXPECTED to succeed in
# CI now that the e2e-api job wires an admin token (ADMIN_TOKEN on the
# platform + MOLECULE_ADMIN_TOKEN sent above). A missing id here is a REAL
# break (admin-auth wiring, org-import create, or the mock short-circuit) and
# MUST red the gate — so this is a hard fail(), not a best-effort miss. Under
# E2E_REQUIRE_LIVE=1 a FAIL also forces a non-zero exit via
# evaluate_require_live_gate. Surface the response so the break is visible
# (e.g. {"error":"admin auth required"} would mean the token wiring regressed).
fail "create mock workspace (org-import)" "$import_resp"
return 0
fi
CREATED_WSIDS+=("$wsid")
echo " workspace=$wsid"
# Mock goes straight to online (no container boot) — a short budget is
# plenty; if it is NOT online quickly the mock short-circuit in
# createWorkspaceTree is genuinely broken and the gate SHOULD red.
local final
final=$(wait_for_status "$wsid" "online failed" 60) || true
if [ "$final" != "online" ]; then
fail "mock workspace reaches online" "final status: $final (mock should go online without provisioning)"
return 0
fi
pass "mock workspace reaches online"
# Mock workspaces are not created with an inline token; mint one via the
# admin endpoint (same fallback every other arm uses).
local token
token=$(e2e_mint_workspace_token "$wsid") || true
if [ -z "$token" ]; then
fail "resolve mock workspace token" "no token returned from POST /admin/workspaces/:id/tokens"
return 0
fi
# A2A round-trip. The mock proxy returns a canned non-error reply (one
# of the "On it!" variants) — NOT "PONG" — so we validate on the
# non-empty branch. A non-error, non-empty reply means the A2A proxy
# short-circuit + reply-shape contract are intact end-to-end.
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
validated "mock reply non-empty (canned; first 80 chars: ${reply:0:80})"
assert_activity_logged "mock" "$wsid" "$token"
else
fail "mock reply" "${reply:-<empty or error>} (mock A2A short-circuit should always return a canned reply)"
fi
}
####################################################################
# MiniMax live arm — OPPORTUNISTIC (best-effort) real-LLM arm.
####################################################################
# NOTE: this is now a BEST-EFFORT arm, not the REQUIRE-LIVE backbone.
# mock (run_mock above) is the guaranteed, no-key validation that keeps
# the gate honest. MiniMax-create is fragile in CI: the namespaced model
# id minimax:MiniMax-M2.7 is NOT in claude-code's native model set and
# does NOT resolve via DeriveProvider (its only prefix-owner, byok-minimax,
# is not wired as a claude-code runtime arm), so the create is rejected
# 422 UNREGISTERED_MODEL_FOR_RUNTIME before any provisioning (RCA core
# registry_gen.go Runtimes["claude-code"]). Rather than red the REQUIRED
# gate on that registry-skew (or on any transient MiniMax provisioning /
# model-registration issue), this arm reports a best-effort MISS via
# bestfail() and lets mock carry the validation. If MiniMax DOES come up
# it validates as a bonus real-LLM check.
# Drives the claude-code runtime against MiniMax (BYOK) using the
# already-present Gitea secret MOLECULE_STAGING_MINIMAX_API_KEY,
# surfaced into the env as E2E_MINIMAX_API_KEY (same name + secret the
# staging-smoke / continuous-synth canaries use — see staging-smoke.yml
# and continuous-synth-e2e.yml). NO new credential is introduced.
#
# Why this is the arm that keeps the REQUIRED gate honest:
# - claude-code's `minimax` provider (providers.yaml / registry_gen.go)
# is third_party_anthropic_compat: it reads MINIMAX_API_KEY at boot
# and routes ANTHROPIC_BASE_URL → api.minimax.io/anthropic. So the
# ONLY tenant secret needed is {"MINIMAX_API_KEY": <key>} — exactly
# the SECRETS_JSON branch test_staging_full_saas.sh uses.
# - Model id is the NAMESPACED colon-form `minimax:MiniMax-M2.7`, the
# registered BYOK arm for claude-code (registry_gen.go Runtimes
# ["claude-code"]["minimax"]). Per core#2263 the BARE `MiniMax-M2`
# id can 400 on a registry-skewed ws-server build; the namespaced
# form resolves the way kimi's `moonshot/…` does, so it's the
# robust choice for the gate.
run_minimax() {
echo ""
echo "=== minimax (claude-code BYOK) happy path ==="
if [ -z "${E2E_MINIMAX_API_KEY:-}" ]; then
skip "E2E_MINIMAX_API_KEY not set (MiniMax live arm needs the MiniMax key)"
return 0
fi
local secrets
secrets=$(python3 -c "
import json, os
# claude-code's minimax provider (third_party_anthropic_compat) reads
# MINIMAX_API_KEY and points ANTHROPIC_BASE_URL at api.minimax.io/anthropic
# at boot — so the ONLY tenant secret needed is the MiniMax key itself.
print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))
")
local resp wsid
# Namespaced BYOK model id (core#2263): bare MiniMax-M2 can 400 on a
# registry-skewed ws-server build; minimax:MiniMax-M2.7 is the
# registered claude-code BYOK arm and resolves like kimi's moonshot/…
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
-d "{\"name\":\"Priority E2E (minimax)\",\"runtime\":\"claude-code\",\"model\":\"minimax:MiniMax-M2.7\",\"tier\":1,\"secrets\":$secrets}")
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
if [ -z "$wsid" ]; then
# BEST-EFFORT: MiniMax-create is fragile (see header — the namespaced
# model id is registry-skewed → 422). Do NOT red the gate; mock is the
# required backbone. Report the create response so the skew is visible.
bestfail "create minimax workspace (best-effort; mock carries the gate)" "$resp"
return 0
fi
CREATED_WSIDS+=("$wsid")
echo " workspace=$wsid"
# claude-code runtime image is already pulled; cold boot ~30-90s. The
# first MiniMax cold-call can be slow but that's covered by send_test_prompt's
# --max-time 180.
local final
final=$(wait_for_status "$wsid" "online failed" 240) || true
if [ "$final" != "online" ]; then
bestfail "minimax workspace reaches online (best-effort)" "final status: $final"
return 0
fi
pass "minimax workspace reaches online"
local token
token=$(echo "$resp" | e2e_extract_token)
if [ -z "$token" ]; then
token=$(e2e_mint_workspace_token "$wsid")
fi
if [ -z "$token" ]; then
bestfail "resolve minimax workspace token (best-effort)" "no token returned"
return 0
fi
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
if echo "$reply" | grep -q "PONG"; then
validated "minimax reply contains PONG"
else
validated "minimax reply non-empty (first 80 chars: ${reply:0:80})"
fi
assert_activity_logged "minimax" "$wsid" "$token"
else
bestfail "minimax reply (best-effort)" "${reply:-<empty or error>}"
fi
}
# `mock` runs FIRST and by default: it is the no-key REQUIRE-LIVE backbone
# that guarantees >=1 validation on a healthy platform (see run_mock). The
# real-LLM arms (claude-code/codex/hermes/openclaw/minimax) run if their
# secrets are present and add real-provider coverage on top; minimax is
# best-effort (never reds the gate).
WANT="${E2E_RUNTIMES:-mock claude-code codex hermes openclaw minimax}"
WANT="${E2E_RUNTIMES:-claude-code codex hermes openclaw}"
for r in $WANT; do
case "$r" in
mock) run_mock ;;
claude-code) run_claude_code ;;
codex) run_codex ;;
hermes) run_hermes ;;
openclaw) run_openclaw ;;
minimax) run_minimax ;;
all) run_mock; run_claude_code; run_codex; run_hermes; run_openclaw; run_minimax ;;
all) run_claude_code; run_codex; run_hermes; run_openclaw ;;
*) echo "unknown runtime in E2E_RUNTIMES: $r" >&2; exit 2 ;;
esac
done
echo ""
echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped, $VALIDATED runtime(s) validated end-to-end ==="
# Final exit decision lives in evaluate_require_live_gate (defined at the top of
# this file, before any platform I/O) so the same logic is unit-tested in
# isolation by test_require_live_priority_gate_unit.sh. Mirror its return code
# into the process exit code.
evaluate_require_live_gate
exit $?
echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ==="
[ "$FAIL" -eq 0 ]
@@ -50,11 +50,7 @@
# Optional env (mirrors the full-saas harness where they overlap):
# E2E_RUNTIME claude-code (default)
# E2E_PROVISION_TIMEOUT_SECS default 900 (cold EC2 budget)
# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 900 (15min). A workspace that
# cannot reach online in 15min is a staging/boot problem,
# not slow cold-boot — fail fast so the trap tears down the
# EC2 instead of hanging ~1h and leaking a running instance
# (observed: run 216031 hung 32min with a live e2e-rec EC2).
# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 3600 (cold-boot worst-case)
# E2E_RECONCILE_OFFLINE_TIMEOUT_SECS default 180 (PRIMARY: leave 'online'.
# Reconciler cadence is 60s — 3 cycles +
# AWS terminate-visibility slack.)
@@ -86,7 +82,7 @@ CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
RUNTIME="${E2E_RUNTIME:-claude-code}"
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-900}"
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}"
# PRIMARY bound: the reconciler ticks every 60s; it needs one cycle to see
# the dead instance after AWS makes the terminate visible to DescribeInstances
# (typically seconds, but can lag). 180s = ~3 cycles + slack.
@@ -329,18 +325,7 @@ ws_field() {
# tolerable — but wiring the same keys keeps boot behaviour identical to the
# sibling and avoids a config path that only this test would exercise.
SECRETS_JSON='{}'
# Platform-managed path (E2E_LLM_PATH=platform, the DEFAULT for this test):
# the workspace boots on the CP LLM proxy with NO tenant key, model
# moonshot/kimi-k2.6 — the exact create combo test_staging_full_saas.sh uses
# successfully. This test only needs the workspace to reach status=online so
# it can kill the EC2 and assert the reconciler heals it; it does NOT exercise
# a real LLM completion, so the platform path is both sufficient and the one
# proven to create cleanly. (The BYOK key paths below 400'd at create — see
# the create-failure capture added below — which is why platform is default.)
if [ "${E2E_LLM_PATH:-platform}" = "platform" ]; then
log " LLM path: PLATFORM-MANAGED (no tenant key; moonshot/kimi-k2.6 via proxy)"
SECRETS_JSON='{}'
elif [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))")
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'ANTHROPIC_API_KEY': os.environ['E2E_ANTHROPIC_API_KEY']}))")
@@ -360,32 +345,21 @@ print(json.dumps({
")
fi
E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" MODEL_SLUG=$(E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" pick_model_slug "$RUNTIME")
MODEL_SLUG=$(pick_model_slug "$RUNTIME")
log " MODEL_SLUG=$MODEL_SLUG"
log "4/6 Provisioning workspace (runtime=$RUNTIME)..."
# --fail-with-body makes curl exit non-zero on a 4xx/5xx but STILL writes the
# response body to stdout; the `|| { ... }` catches that so the body is printed
# instead of `set -e` aborting the command-substitution silently (the old bug
# that hid the real HTTP-400 reason). $WS_RESP holds the body either way.
WS_RESP=$(tenant_call POST /workspaces \
-H "Content-Type: application/json" \
-d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}") || {
rc=$?
fail "Workspace create failed (curl rc=$rc, model=$MODEL_SLUG). Response body: $WS_RESP"
}
WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
[ -z "$WS_ID" ] && fail "Workspace create response missing 'id' (model=$MODEL_SLUG): $WS_RESP"
-d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}")
WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['id'])")
[ -z "$WS_ID" ] && fail "Workspace create response missing 'id': $WS_RESP"
log " WS_ID=$WS_ID"
# Wait for the workspace to reach status=online and capture its instance_id.
log " Waiting for workspace to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min)..."
ONLINE_DEADLINE=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS ))
ORIGINAL_INSTANCE_ID=""
ONLINE_SINCE=""
# Grace before falling back to the AWS workspace tag when the tenant API
# does not surface instance_id (observed on staging).
INSTANCE_ID_GRACE_SECS="${E2E_INSTANCE_ID_GRACE_SECS:-45}"
WS_LAST_STATUS=""
while true; do
if [ "$(date +%s)" -gt "$ONLINE_DEADLINE" ]; then
@@ -398,27 +372,11 @@ while true; do
WS_LAST_STATUS="$WS_STATUS"
fi
if [ "$WS_STATUS" = "online" ]; then
[ -z "$ONLINE_SINCE" ] && ONLINE_SINCE=$(date +%s)
ORIGINAL_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
break
fi
# The workspace is online but the tenant API does not surface instance_id
# (observed on staging — the DB has it, the API response omits it). After a
# short grace, fall back to the AWS workspace-instance tag so the kill step
# can proceed. The reconciler reads instance_id from the DB and acts on the
# real EC2 regardless of what the API surfaces, so the AWS-tag instance is
# the correct kill target. Without this fallback the loop spins to the online
# deadline and fails with a misleading "never reached online".
if [ $(( $(date +%s) - ONLINE_SINCE )) -ge "$INSTANCE_ID_GRACE_SECS" ]; then
# ws-tenant-<slug>-<wsid...> is the workspace EC2 (vs tenant-<slug>).
ORIGINAL_INSTANCE_ID=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null \
| awk '$2 ~ /^ws-tenant-/ {print $1}' | sort -u | head -1)
if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
log " instance_id not surfaced by API after ${INSTANCE_ID_GRACE_SECS}s — using AWS workspace tag: $ORIGINAL_INSTANCE_ID"
break
fi
fi
# online but instance_id not surfaced yet — keep polling briefly.
log " $WS_ID online but instance_id not populated yet — waiting"
fi
# 'failed' is transient on cold boot (bootstrap-watcher deadline vs heartbeat
-124
View File
@@ -1,124 +0,0 @@
#!/usr/bin/env bash
# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE
# fail-closed-on-skip guard in test_staging_full_saas.sh.
#
# WHY (harden/e2e-staging-saas-failclosed): the staging SaaS E2E is being
# hardened to become a HARD merge-gate. A gate that can reach its final `ok`
# WITHOUT having actually exercised a provision→online→A2A cycle is a
# false-green — it would let a refactor that short-circuits the lifecycle
# (or a skip path that swallows it) report PASS. require_live_or_die() is the
# guard; this test proves it FAILS (exit 5) when milestones are missing and
# PASSES when all fired — the watch-it-fail counterpart the dev-SOP requires.
#
# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
# logic — so it can run on every PR in the fast lane and locally via `bash`.
set -uo pipefail
# Scratch dir for the generated guard-runner stubs. EXIT trap guarantees
# cleanup even when an assertion exits the test non-zero (lint_cleanup_traps).
TMPDIR_E2E=$(mktemp -d -t require-live-guard-XXXXXX)
trap 'rm -rf "$TMPDIR_E2E"' EXIT INT TERM
PASS=0
FAIL=0
# Reproduce the EXACT guard logic from test_staging_full_saas.sh. Kept in
# lockstep with the host script: if the host logic changes, this test must
# change with it (and a divergence is itself a signal to re-prove the gate).
make_guard_runner() {
cat <<'EOF'
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
LIVE_MILESTONES=""
live_milestone() {
case " $LIVE_MILESTONES " in
*" $1 "*) ;;
*) LIVE_MILESTONES="$LIVE_MILESTONES $1" ;;
esac
}
require_live_or_die() {
[ "$REQUIRE_LIVE" = "1" ] || return 0
local required="provisioned tenant_online workspace_online a2a_roundtrip"
local m missing=""
for m in $required; do
case " $LIVE_MILESTONES " in
*" $m "*) ;;
*) missing="$missing $m" ;;
esac
done
if [ -n "$missing" ]; then
echo "MISSING:${missing}" >&2
exit 5
fi
}
EOF
}
# run_case <E2E_REQUIRE_LIVE value> <space-separated milestones to stamp>
# echoes the observed exit code.
run_case() {
local require_live="$1"; shift
local milestones="$1"; shift || true
local stub observed m
stub=$(mktemp "$TMPDIR_E2E/stub.XXXXXX")
{
echo "#!/usr/bin/env bash"
echo "set -uo pipefail"
make_guard_runner
for m in $milestones; do
echo "live_milestone $m"
done
echo "require_live_or_die"
echo 'echo REACHED_END'
} > "$stub"
E2E_REQUIRE_LIVE="$require_live" bash "$stub" >/dev/null 2>&1
observed=$?
rm -f "$stub"
echo "$observed"
}
assert_rc() {
local label="$1" require_live="$2" milestones="$3" expected="$4"
local observed
observed=$(run_case "$require_live" "$milestones")
if [ "$observed" = "$expected" ]; then
echo "$label: REQUIRE_LIVE=$require_live milestones='$milestones' → rc=$observed"
PASS=$((PASS+1))
else
echo "$label: REQUIRE_LIVE=$require_live milestones='$milestones' expected=$expected OBSERVED=$observed" >&2
FAIL=$((FAIL+1))
fi
}
echo "=== E2E_REQUIRE_LIVE fail-closed-on-skip guard proof ==="
echo
# DECISIVE (false-green trap): REQUIRE_LIVE=1 but NO lifecycle ran → exit 5.
assert_rc "require-live, nothing ran → exit 5 (the false-green trap)" \
1 "" 5
# REQUIRE_LIVE=1 with a partial lifecycle (provisioned but no A2A) → exit 5.
assert_rc "require-live, partial lifecycle → exit 5" \
1 "provisioned tenant_online workspace_online" 5
# REQUIRE_LIVE=1 with every required milestone → pass (rc=0).
assert_rc "require-live, full lifecycle → pass" \
1 "provisioned tenant_online workspace_online a2a_roundtrip" 0
# Idempotency: duplicate stamps don't break membership; full set still passes.
assert_rc "require-live, duplicate stamps still pass" \
1 "provisioned provisioned tenant_online workspace_online a2a_roundtrip a2a_roundtrip" 0
# Guard is a no-op when CI did not demand a live run: a non-live local run
# with nothing stamped must NOT exit 5 (we don't break local/debug runs).
assert_rc "no require-live, nothing ran → pass (guard is opt-in)" \
0 "" 0
assert_rc "require-live unset-equivalent (0), partial → pass" \
0 "provisioned" 0
# Extra unknown milestone is harmless as long as required set is present.
assert_rc "require-live, extra milestone tolerated" \
1 "provisioned tenant_online workspace_online a2a_roundtrip extra_thing" 0
echo
echo "=== Results: $PASS passed, $FAIL failed ==="
[ "$FAIL" -eq 0 ]
@@ -1,114 +0,0 @@
#!/usr/bin/env bash
# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE zero-validated
# gate in test_priority_runtimes_e2e.sh (the REQUIRED `E2E API Smoke Test`).
#
# WHY (harden/enforce-ci-gates-core-v2, PR #2286): the priority-runtimes E2E's
# only historical exit gate was `[ "$FAIL" -eq 0 ]`. When every runtime SKIPs
# because no live secret is present — exactly what the CI step did — PASS=0
# FAIL=0 and the script exited 0 (GREEN) while validating ZERO runtimes. The
# REQUIRED merge gate was therefore false-green: passing without exercising a
# single runtime. The fix adds a VALIDATED counter and makes a zero-validated
# run RED when E2E_REQUIRE_LIVE is set.
#
# That zero-validated→RED decision lives in evaluate_require_live_gate() in
# test_priority_runtimes_e2e.sh. CI cannot prove it via a live arm — the CI
# substrate can't provision ANY runtime end-to-end (MiniMax 422, mock org-
# import create fails, claude-code needs a key CI lacks), so the live e2e-api
# job does NOT force E2E_REQUIRE_LIVE (that would red the required gate for
# everyone). This UNIT test is the regression coverage instead: it drives the
# REAL evaluate_require_live_gate() function — not a copy — in isolation by
# sourcing the script with E2E_PRIORITY_UNIT_SOURCE=1 (which stops before any
# platform I/O), setting the counters, and asserting the gate's return code.
#
# Because it exercises the actual function, a future revert of the zero-
# validated→RED logic in test_priority_runtimes_e2e.sh fails THIS test on
# every PR — so the false-green can't silently come back.
#
# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
# logic — so it runs on every PR in the fast lane and locally via `bash`.
set -uo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
GATE_SCRIPT="$SCRIPT_DIR/test_priority_runtimes_e2e.sh"
if [ ! -f "$GATE_SCRIPT" ]; then
echo "FATAL: cannot find $GATE_SCRIPT" >&2
exit 2
fi
PASS=0
FAIL=0
# run_case <E2E_REQUIRE_LIVE value> <VALIDATED count> <FAIL count>
# Sources the REAL test_priority_runtimes_e2e.sh under the unit source-guard
# (E2E_PRIORITY_UNIT_SOURCE=1 → it returns right after defining the counters
# and evaluate_require_live_gate(), before _lib.sh / the live pre-sweep curl),
# sets the counters to the scenario, calls the real gate, and echoes the
# return code. Each case runs in a fresh `bash -c` so set -e/-u inside the
# sourced script can't leak between cases or kill this harness.
run_case() {
local require_live="$1" validated="$2" failcount="$3"
local observed
E2E_PRIORITY_UNIT_SOURCE=1 \
E2E_REQUIRE_LIVE="$require_live" \
GATE_SCRIPT="$GATE_SCRIPT" \
VAL="$validated" \
FL="$failcount" \
bash -c '
set -uo pipefail
# shellcheck disable=SC1090
source "$GATE_SCRIPT" # returns at the source-guard (no platform I/O)
VALIDATED="$VAL"
FAIL="$FL"
evaluate_require_live_gate >/dev/null 2>&1
exit $?
'
observed=$?
echo "$observed"
}
assert_rc() {
local label="$1" require_live="$2" validated="$3" failcount="$4" expected="$5"
local observed
observed=$(run_case "$require_live" "$validated" "$failcount")
if [ "$observed" = "$expected" ]; then
echo "$label: REQUIRE_LIVE=$require_live VALIDATED=$validated FAIL=$failcount → rc=$observed"
PASS=$((PASS + 1))
else
echo "$label: REQUIRE_LIVE=$require_live VALIDATED=$validated FAIL=$failcount expected=$expected OBSERVED=$observed" >&2
FAIL=$((FAIL + 1))
fi
}
echo "=== E2E_REQUIRE_LIVE priority-runtimes zero-validated gate proof ==="
echo " (drives the REAL evaluate_require_live_gate from $GATE_SCRIPT)"
echo
# (a) DECISIVE false-green trap: REQUIRE_LIVE=1 + zero validated → RED (exit 1).
assert_rc "require-live, zero validated → RED (the false-green trap)" \
1 0 0 1
# (b) REQUIRE_LIVE=1 + at least one validated → GREEN (exit 0).
assert_rc "require-live, one validated → GREEN" \
1 1 0 0
assert_rc "require-live, several validated → GREEN" \
1 3 0 0
# (c) REQUIRE_LIVE unset-equivalent (0) + zero validated → GREEN (loud skip).
assert_rc "no require-live, zero validated → GREEN (dev-convenience loud skip)" \
0 0 0 0
# REQUIRE_LIVE=true (string form) is also honoured by the gate.
assert_rc "require-live='true', zero validated → RED" \
true 0 0 1
# A real FAIL is always red, regardless of REQUIRE_LIVE / VALIDATED — the
# zero-validated guard must not mask (nor be masked by) a genuine failure.
assert_rc "real FAIL with validations, no require-live → RED" \
0 2 1 1
assert_rc "real FAIL, zero validated, no require-live → RED" \
0 0 1 1
echo
echo "=== Results: $PASS passed, $FAIL failed ==="
[ "$FAIL" -eq 0 ]
+26 -146
View File
@@ -40,25 +40,9 @@
# E2E_INTENTIONAL_FAILURE 1 → break a step on purpose to verify
# the EXIT trap still tears down (mirrors
# the full-saas harness's safety net).
# E2E_REQUIRE_LIVE 1 → fail-closed if the harness exits 0
# WITHOUT having driven all four
# awaiting_agent transitions. CI sets this
# so a future skip / early-return can never
# masquerade as a green run. Mirrors CP
# serving-e2e SERVING_E2E_REQUIRE_LIVE.
# E2E_STALE_POLL_DEADLINE_SECS default 240. Upper bound for the
# heartbeat-staleness READINESS poll (step
# 6). Replaces the old fixed sleep+one-shot
# assert that raced the sweep cadence.
# E2E_TRANSIENT_RETRIES default 8. Bounded retries for register /
# re-register against transient edge errors
# (502/503/504 from Caddy during cold TLS /
# agent boot). Mirrors the full-saas
# cold-start retry loop — NOT a bare sleep.
#
# Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
# 4 teardown leak, 5 REQUIRE_LIVE violation (exited 0 having validated
# nothing).
# 4 teardown leak.
set -euo pipefail
@@ -67,13 +51,6 @@ ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway s
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
# Readiness-poll deadline for the sweep transition (step 6). Must exceed
# STALE_WAIT_SECS (the no-heartbeat window) by at least one sweep
# interval so a slightly-late sweep tick is polled-for, not misread as a
# stuck 'online'. 240 = 180s window + 60s sweep-cadence headroom.
STALE_POLL_DEADLINE_SECS="${E2E_STALE_POLL_DEADLINE_SECS:-240}"
TRANSIENT_RETRIES="${E2E_TRANSIENT_RETRIES:-8}"
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
@@ -82,66 +59,6 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
# REQUIRE_LIVE bookkeeping: count the four awaiting_agent transitions the
# test is contracted to prove. The EXIT trap fails-closed (exit 5) if the
# script reaches a clean exit without all four — so a silent skip, an
# early `return 0`, or a refactor that drops a step can never show green.
TRANSITIONS_VERIFIED=0
EXPECTED_TRANSITIONS=4
require_transition() { # $1 = human label
TRANSITIONS_VERIFIED=$((TRANSITIONS_VERIFIED + 1))
log " [require-live] transition ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} proven: $1"
}
# Redact bearer tokens from any HTTP body before logging (mirrors the
# full-saas sanitize_http_body so transient-error logs never leak creds).
sanitize_http_body() {
sed -E 's/(Bearer|token)[[:space:]]+[A-Za-z0-9._-]+/\1 REDACTED/g'
}
# Bounded retry-on-transient for POST /registry/register. The tenant edge
# (Caddy) returns 502/503/504 with an identifiable body while TLS / the
# workspace agent finishes cold-booting — a single shot here was the
# un-named flake (a transient edge error misread as a register failure).
# This mirrors the full-saas cold-start loop (test_staging_full_saas.sh
# ~L780-816): retry ONLY on a transient TRANSPORT class (5xx + body
# match), bounded by TRANSIENT_RETRIES, and FAIL CLOSED (non-zero) once
# the budget is spent. It deliberately does NOT retry on a 4xx — that's a
# real contract bug (e.g. wrong payload field) and must stay red.
# Sets REGISTER_RESP (body + trailing "HTTP_CODE=NNN" line) on success;
# returns non-zero (caller `fail`s) when the bounded budget is exhausted.
register_with_retry() { # $1 = step label, $2 = request body
local label="$1" body="$2"
local attempt code resp safe
for attempt in $(seq 1 "$TRANSIENT_RETRIES"); do
set +e
resp=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST \
"$TENANT_URL/registry/register" \
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
-H "X-Molecule-Org-Id: $ORG_ID" \
-H "Content-Type: application/json" \
-d "$body")
set -e
code=$(printf '%s' "$resp" | sed -n 's/^HTTP_CODE=//p' | tail -n1)
code=${code:-000}
if [ "$code" = "200" ]; then
REGISTER_RESP="$resp"
return 0
fi
safe=$(printf '%s' "$resp" | sanitize_http_body | head -c 300)
# Retry ONLY on a transient transport class; a 4xx is a real bug.
if echo "$code" | grep -Eq '^(502|503|504)$' \
&& echo "$safe" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream'; then
log " ${label} transient $code attempt ${attempt}/${TRANSIENT_RETRIES}: $safe"
[ "$attempt" -lt "$TRANSIENT_RETRIES" ] && { sleep 10; continue; }
fi
# Non-transient (4xx, or unrecognized 5xx body): stop and fail closed.
REGISTER_RESP="$resp"
return 1
done
return 1
}
CURL_COMMON=(-sS --fail-with-body --max-time 30)
# ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
@@ -181,19 +98,8 @@ cleanup_org() {
fi
ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
# REQUIRE_LIVE fail-closed gate. Only meaningful on an OTHERWISE-CLEAN
# exit (entry_rc==0): a script that completed all steps but somehow did
# not register all four transitions (a skip, an early return, a dropped
# assertion in a refactor) must NOT report success. A non-zero entry_rc
# already carries its own failure semantics — don't mask it with 5.
if [ "$entry_rc" = "0" ] && [ "${REQUIRE_LIVE}" = "1" ] \
&& [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
echo "❌ REQUIRE_LIVE: exited 0 but only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} awaiting_agent transitions were proven — refusing to report green." >&2
exit 5
fi
case "$entry_rc" in
0|1|2|3|4|5) ;;
0|1|2|3|4) ;;
*) exit 1 ;;
esac
}
@@ -342,7 +248,6 @@ GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
ok "DB row stored as awaiting_agent (proof migration 046 applied)"
require_transition "create: provisioning → awaiting_agent (DB-verified)"
# ─── 5. Register the workspace (transitions to online) ──────────────────
# Pre-fix this path was actually fine because it writes 'online', a value
@@ -372,20 +277,20 @@ log "5/8 Registering workspace via /registry/register..."
# url — accepted but not dispatched-to in poll mode, so
# example.invalid is a valid sentinel.
REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
# Bounded retry-on-transient (see register_with_retry). The previous
# single-shot here would `fail` on a cold-boot 502 from the tenant edge —
# an un-named transient misread as a register break. The helper retries
# ONLY that class and fails closed on a real 4xx or an exhausted budget.
REGISTER_RESP=""
register_with_retry "register" "$REGISTER_BODY" \
|| fail "register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
log " register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
# Disable --fail-with-body for this one call so a 4xx surfaces the response
# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
-H "X-Molecule-Org-Id: $ORG_ID" \
-H "Content-Type: application/json" \
-d "$REGISTER_BODY") || true
log " register response: $(echo "$REGISTER_RESP" | head -c 300)"
echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
ok "Workspace transitioned to online"
require_transition "register: awaiting_agent → online"
# Confirm the register handler echoed back delivery_mode=poll. We read
# this from the register RESPONSE, not the workspace GET response, because
@@ -405,63 +310,38 @@ fi
# This is the SECOND silent-failure path (registry/healthsweep.go's
# sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
# UPDATE silently failed and the workspace stuck on 'online' forever
# even though no agent was alive.
#
# FLAKE FIX (named: sweep-cadence race). The old code did a FIXED
# `sleep $STALE_WAIT_SECS` then a SINGLE assert. The staleness sweep is a
# periodic tick (REMOTE_LIVENESS_STALE_AFTER + a sweep interval); if the
# tick that flips the row lands even one second after the fixed sleep, the
# one-shot GET reads 'online' and the test fails — a real transition,
# misread as a flake because the assert was racing the sweep cadence.
# Replace with: sleep through the mandatory no-heartbeat window ONCE (the
# sweep cannot fire before the window elapses, so polling earlier is
# pointless), then READINESS-POLL for the awaiting_agent transition up to
# STALE_POLL_DEADLINE_SECS, hard-failing with a clear message at the
# deadline. Deterministic: a slow-but-working sweep passes; a genuinely
# stuck 'online' still fails (now with how long we actually waited).
log "6/8 Waiting ${STALE_WAIT_SECS}s no-heartbeat window, then polling for sweep (up to ${STALE_POLL_DEADLINE_SECS}s total)..."
[ "$STALE_POLL_DEADLINE_SECS" -le "$STALE_WAIT_SECS" ] && \
fail "Misconfigured: STALE_POLL_DEADLINE_SECS ($STALE_POLL_DEADLINE_SECS) must exceed STALE_WAIT_SECS ($STALE_WAIT_SECS) by at least one sweep interval"
# even though no agent was alive. We wait the full window + a sweep
# interval and assert the row transitions back to 'awaiting_agent'.
log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
sleep "$STALE_WAIT_SECS"
STALE_DEADLINE=$(( $(date +%s) + (STALE_POLL_DEADLINE_SECS - STALE_WAIT_SECS) ))
STALE_STATUS=""
while true; do
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$STALE_STATUS" = "awaiting_agent" ] && break
if [ "$(date +%s)" -gt "$STALE_DEADLINE" ]; then
fail "After ${STALE_POLL_DEADLINE_SECS}s with no heartbeat, status still '$STALE_STATUS' (expected awaiting_agent sweep transition) — migration 046 likely not applied OR sweep not running"
fi
sleep 10
done
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$STALE_STATUS" != "awaiting_agent" ] && \
fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
require_transition "sweep: online → awaiting_agent (no heartbeat)"
# ─── 7. Re-register and confirm we can come back online ─────────────────
# This proves the awaiting_agent state is recoverable (re-registrable),
# which is the whole point of using it instead of 'offline'.
log "7/8 Re-registering after stale → confirming recovery to online..."
# Same payload contract as step 5 (id + agent_card both required). See note
# there for why workspace_id would 400. Same bounded retry-on-transient.
REGISTER_RESP=""
register_with_retry "re-register" "$REGISTER_BODY" \
|| fail "re-register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
log " re-register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
# there for why workspace_id would 400.
REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
-H "X-Molecule-Org-Id: $ORG_ID" \
-H "Content-Type: application/json" \
-d "$REGISTER_BODY") || true
log " re-register response: $(echo "$REREG_RESP" | head -c 300)"
echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$RECOVERED_STATUS" != "online" ] && \
fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
require_transition "re-register: awaiting_agent → online (recovery)"
# ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
# REQUIRE_LIVE belt-and-braces: assert here too (in addition to the EXIT
# trap) so the failure surfaces in step order, not only post-teardown.
if [ "${REQUIRE_LIVE}" = "1" ] && [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
fail "REQUIRE_LIVE: only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} transitions proven at end of run"
fi
log "8/8 All four awaiting_agent transitions verified."
log "═══════════════════════════════════════════════════════════════════"
ok "External-runtime E2E PASSED on $SLUG"
+14 -166
View File
@@ -47,15 +47,6 @@
# tear down cleanly (and exit 4 on leak).
# Used by a dedicated sanity workflow
# that verifies the safety net.
# E2E_REQUIRE_LIVE 1 → fail-closed-on-skip guard (CI sets this).
# When set, the run MUST actually complete
# ≥1 full provision→online→A2A cycle. A run
# that reaches the end without having proven
# a real round-trip (e.g. a future refactor
# short-circuits a stage, or a skip path
# swallows the lifecycle) exits 5 rather than
# reporting a false green. Mirrors CP
# serving-e2e's SERVING_E2E_REQUIRE_LIVE.
#
# Exit codes:
# 0 happy path
@@ -63,37 +54,6 @@
# 2 missing required env
# 3 provisioning timed out
# 4 teardown left orphan resources
# 5 E2E_REQUIRE_LIVE set but the run validated no real lifecycle (no
# false-green-on-skip)
#
# ─────────────────────────────────────────────────────────────────────────
# PROMOTION-READINESS (harden/e2e-staging-saas-failclosed):
# This harness is being hardened so `E2E Staging SaaS` + `E2E Staging
# Platform Boot` can become HARD merge-gates. continue-on-error is NOT
# flipped here — that promotion is the CTO's irreversible branch-protection
# call. What this branch makes fail-closed (was false-green / un-named
# flake before):
# • Provision/online waits are bounded readiness-POLLS, not fixed sleeps;
# each hard-fails with a named mechanism + last-seen signal on deadline,
# never a silent timeout (cp#245 boot-timeout class).
# • Peer-discovery (9b) asserts a real 2xx, not just "not 404" — a 5xx /
# 000 / empty no longer reads as "reachable".
# • Activity-log (9b) is ASSERTED reachable (2xx + parseable), not
# logged-and-ignored behind `|| echo '[]'`.
# • Child activity provenance (10) is asserted (was soft-logged).
# • E2E_REQUIRE_LIVE=1 (CI) makes the run exit 5 if it reached the end
# without proving a real provision→online→A2A round-trip — no
# false-green-on-skip.
# STILL BLOCKS making it REQUIRED (must clear before the CTO flips
# continue-on-error→false in .gitea/workflows/e2e-staging-saas.yml):
# • De-flake window: N consecutive green runs on main for BOTH jobs
# (platform-boot shares the cp#245 boot surface — #2187 tracks its
# flip). This harness removes the harness-side flake mechanisms; the
# remaining surface is real-infra (EC2 cold boot, CF DNS) latency,
# already bounded by the readiness polls above.
# • Branch-protection required-context wiring is a repo-settings change,
# not a code change in this PR.
# ─────────────────────────────────────────────────────────────────────────
set -euo pipefail
@@ -130,41 +90,6 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
# ─── fail-closed-on-skip live-lifecycle guard ───────────────────────────
# E2E_REQUIRE_LIVE=1 (set by CI) asserts this run ACTUALLY exercised a full
# provision→online→A2A cycle. Each load-bearing lifecycle stage stamps a
# milestone via live_milestone(); at the very end, require_live_or_die()
# checks every required milestone fired. Mechanism: without this, a future
# refactor that short-circuits a stage — or a skip/early-return path that
# swallows the lifecycle — would let the script reach its final `ok` and
# report GREEN having validated nothing. Mirrors CP serving-e2e's
# SERVING_E2E_REQUIRE_LIVE (skip-if-absent must be LOUD, never silent green).
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
LIVE_MILESTONES=""
live_milestone() {
# Idempotent set-membership append. Space-delimited; names are tokens.
case " $LIVE_MILESTONES " in
*" $1 "*) ;;
*) LIVE_MILESTONES="$LIVE_MILESTONES $1" ;;
esac
}
require_live_or_die() {
# No-op unless CI demanded a live run.
[ "$REQUIRE_LIVE" = "1" ] || return 0
local required="provisioned tenant_online workspace_online a2a_roundtrip"
local m missing=""
for m in $required; do
case " $LIVE_MILESTONES " in
*" $m "*) ;;
*) missing="$missing $m" ;;
esac
done
if [ -n "$missing" ]; then
echo "[$(date +%H:%M:%S)] ❌ E2E_REQUIRE_LIVE=1 but the run did NOT prove a full live lifecycle — missing milestone(s):${missing}. Reached:${LIVE_MILESTONES:-<none>}. This is a false-green-on-skip guard: a run that validates no real provision→online→A2A cycle MUST NOT report green." >&2
exit 5
fi
}
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
# without booting the full 11-step lifecycle.
@@ -272,7 +197,7 @@ cleanup_org() {
# case statement, and opens a false-positive priority-high
# "safety net broken" issue (#2159, 2026-04-27).
case "$entry_rc" in
0|1|2|3|4|5) ;; # contracted codes — let bash use entry_rc
0|1|2|3|4) ;; # contracted codes — let bash use entry_rc
*) exit 1 ;; # anything else is a generic failure
esac
}
@@ -370,7 +295,6 @@ print('(no org row found for slug=$SLUG — DB drift?)')
esac
done
ok "Tenant provisioning complete"
live_milestone provisioned
# Derive tenant domain from CP hostname so the same harness works in
# both prod (api.moleculesai.app → moleculesai.app) and staging
@@ -427,7 +351,6 @@ while true; do
sleep 5
done
ok "Tenant reachable at $TENANT_URL"
live_milestone tenant_online
# Sanity-test path: once the tenant is provisioned, poisoning the
# tenant token proves the EXIT trap + leak assertion still fire.
@@ -647,7 +570,6 @@ fi
WS_TO_CHECK=("$PARENT_ID")
[ -n "$CHILD_ID" ] && WS_TO_CHECK+=("$CHILD_ID")
wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min — hermes cold boot)..." "${WS_TO_CHECK[@]}"
live_milestone workspace_online
# ─── 7a. Real chat image upload/download round-trip ───────────────────
# This deliberately uses the production workflow: tenant admin/session auth
@@ -964,7 +886,7 @@ fi
# identical on main's scheduled synthetic E2E and on PRs (so it is an
# environmental backend regression, never PR-introduced).
if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code default is minimax:MiniMax-M2.7 since #2263; was bare MiniMax-M2 #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (MiniMax-M2 since #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
fi
# Generic catch-all — falls through if none of the known regressions hit.
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
@@ -1030,14 +952,7 @@ for KA_ATTEMPT in $(seq 1 6); do
KA_SAFE_BODY=$(printf '%s' "$KA_RESP" | sanitize_http_body)
# Retry ONLY on transient transport errors — never on an agent-level
# error (those must surface and fail the gate).
# #2263: include the Cloudflare-shaped literal `error code: 502/504` token so a
# bare edge/gateway 502 (no "Bad Gateway" body) is retried here the same way the
# cold-start PONG probe (line ~800) and the delegation loop (line ~1234) already
# do. Without it, a single un-retried edge 502 right after a healthy round-trip
# fell through to break and failed the gate on the first attempt (Platform Boot
# job, task 268859). Bounded by the existing 6-attempt / sleep-10 loop — no new
# sleep-as-fix; this only widens the transient-match to the sibling pattern.
if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
log " known-answer A2A transient $KA_CODE attempt $KA_ATTEMPT/6: $KA_SAFE_BODY"
if [ "$KA_ATTEMPT" -lt 6 ]; then sleep 10; continue; fi
fi
@@ -1059,11 +974,6 @@ except Exception:
" 2>/dev/null || echo "")
# CORE GATE: contains PINEAPPLE (real round-trip) AND no error-as-text.
a2a_assert_real_completion "$KA_TEXT" "PINEAPPLE" "A2A known-answer (parent, $RUNTIME/$MODEL_SLUG)"
# Real, deterministic LLM round-trip proven — the load-bearing milestone for
# the fail-closed-on-skip guard. Stamped AFTER a2a_assert_real_completion (not
# after the looser PONG check) so the milestone means a verified completion,
# not just a 2xx-with-text.
live_milestone a2a_roundtrip
# ─── 8c. byok-routing regression guard (#1994) ─────────────────────────
# The parent was provisioned with the customer's OWN vendor key
@@ -1189,50 +1099,18 @@ print(json.dumps({
ok "HMA memory write+read roundtripped"
log "9b. Peer discovery + activity log smoke..."
# FAIL-CLOSED: assert a real 2xx, not merely "not 404". The previous
# `[ "$PEERS_CODE" = "404" ] && fail` only caught the route-missing case —
# a 5xx, 000 (connection failure), or empty capture ALL fell through to
# "reachable" (false-green: a broken-but-present route read as healthy).
# Mechanism: route the http_code into its own tempfile (no stderr capture,
# which the old `2>&1 | head -1` could pollute with a curl error line) and
# require 2xx explicitly.
PEERS_TMP=$(e2e_tmp /tmp/e2e_peers.XXXXXX)
set +e
PEERS_CODE=$(tenant_call GET "/registry/$PARENT_ID/peers" \
-o "$PEERS_TMP" -w "%{http_code}" 2>/dev/null)
PEERS_RC=$?
tenant_call GET "/registry/$PARENT_ID/peers" -o /dev/null -w "%{http_code}\n" 2>&1 | head -1 > /tmp/peers_code.txt
set -e
PEERS_CODE=${PEERS_CODE:-000}
if [ "$PEERS_CODE" = "404" ]; then
fail "Peers endpoint missing (404) — route regression. /registry/$PARENT_ID/peers"
fi
if [ "$PEERS_RC" != "0" ] || [ "$PEERS_CODE" -lt 200 ] || [ "$PEERS_CODE" -ge 300 ]; then
fail "Peers endpoint unhealthy (curl_rc=$PEERS_RC, http=$PEERS_CODE) — not a clean 2xx, so 'reachable' would be a false-green. Body: $(head -c 200 "$PEERS_TMP" 2>/dev/null | sanitize_http_body)"
fi
PEERS_CODE=$(cat /tmp/peers_code.txt)
[ "$PEERS_CODE" = "404" ] && fail "Peers endpoint missing (404) — route regression"
ok "Peers endpoint reachable (HTTP $PEERS_CODE)"
# FAIL-CLOSED: the activity-log read was `|| echo '[]'` then the count was
# only LOGGED, never asserted — a 5xx / network failure silently became an
# empty list and the step exited 0 having validated nothing (false-green:
# "validated nothing" class). Assert the endpoint returns a 2xx and a
# parseable activity shape. We do NOT assert count>0 (the parent may
# legitimately have 0 events this early — that's a real, valid state), but
# we DO require the call to have actually succeeded and returned valid JSON.
ACTIVITY_TMP=$(e2e_tmp /tmp/e2e_activity.XXXXXX)
set +e
ACTIVITY_CODE=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" \
-o "$ACTIVITY_TMP" -w "%{http_code}" 2>/dev/null)
ACTIVITY_RC=$?
set -e
ACTIVITY_CODE=${ACTIVITY_CODE:-000}
if [ "$ACTIVITY_RC" != "0" ] || [ "$ACTIVITY_CODE" -lt 200 ] || [ "$ACTIVITY_CODE" -ge 300 ]; then
fail "Activity-log endpoint unhealthy (curl_rc=$ACTIVITY_RC, http=$ACTIVITY_CODE) — was previously swallowed by '|| echo []' and reported as 0 events (false-green). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
fi
ACTIVITY_COUNT=$(python3 -c "import json,sys
d=json.load(open(sys.argv[1]))
print(len(d if isinstance(d, list) else d.get('events', [])))" "$ACTIVITY_TMP" 2>/dev/null) \
|| fail "Activity-log returned HTTP $ACTIVITY_CODE but body was not parseable JSON (events array / {events:[...]}). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
log " Activity events observed: $ACTIVITY_COUNT (endpoint 2xx + parseable ✓)"
ACTIVITY=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" 2>/dev/null || echo '[]')
ACTIVITY_COUNT=$(echo "$ACTIVITY" | python3 -c "import json,sys
d=json.load(sys.stdin)
print(len(d if isinstance(d, list) else d.get('events', [])))" 2>/dev/null || echo 0)
log " Activity events observed: $ACTIVITY_COUNT"
# ─── 9c. Workspace KV memory Edit round-trip ─────────────────────────
# Pins the Edit affordance added to the canvas Memory tab. The UI calls
@@ -1383,44 +1261,14 @@ except Exception:
[ -z "$DELEG_TEXT" ] && fail "Delegation returned no text. Raw: ${DELEG_RESP:0:200}"
ok "Delegation proxy works (child responded: \"${DELEG_TEXT:0:60}\")"
# FAIL-CLOSED via bounded readiness-POLL (was soft-logged false-green).
# The activity pipeline is async, so an immediate single read can miss the
# parent reference — but "did not reference parent" was previously just
# LOGGED and the step passed regardless, so a genuinely broken provenance
# pipeline (parent never recorded as source) read as success. Mechanism:
# poll the child activity log for the parent id for a bounded window
# (E2E_CHILD_ACTIVITY_TIMEOUT_SECS, default 60s) — this is the real
# readiness signal (provenance row materialised), not a fixed sleep — and
# hard-fail with a named mechanism if it never appears.
CHILD_ACT_DEADLINE=$(( $(date +%s) + ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60} ))
CHILD_ACT_SEEN=0
CHILD_ACT_LASTCODE="000"
while true; do
CHILD_ACT_TMP=$(e2e_tmp /tmp/e2e_child_act.XXXXXX)
set +e
CHILD_ACT_CODE=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" \
-o "$CHILD_ACT_TMP" -w "%{http_code}" 2>/dev/null)
set -e
CHILD_ACT_LASTCODE=${CHILD_ACT_CODE:-000}
if grep -q "$PARENT_ID" "$CHILD_ACT_TMP" 2>/dev/null; then
CHILD_ACT_SEEN=1
break
fi
[ "$(date +%s)" -ge "$CHILD_ACT_DEADLINE" ] && break
sleep 5
done
if [ "$CHILD_ACT_SEEN" = "1" ]; then
CHILD_ACT=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" 2>/dev/null || echo '[]')
if echo "$CHILD_ACT" | grep -q "$PARENT_ID"; then
ok "Child activity log records parent as source"
else
fail "Child activity log never referenced parent $PARENT_ID within ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60}s (last http=$CHILD_ACT_LASTCODE) — delegation-provenance pipeline regression (parent not recorded as source). Previously soft-logged → false-green."
log "Child activity log did not reference parent (pipeline may be async)"
fi
fi
# ─── 11. Teardown runs via trap ────────────────────────────────────────
# Fail-closed-on-skip: before declaring PASS, assert (when CI demanded a live
# run) that every load-bearing lifecycle milestone actually fired. A run that
# reaches here without provision→online→A2A having truly happened exits 5
# instead of reporting green. Teardown still runs (EXIT trap) on that exit.
require_live_or_die
log "11/11 All checks passed. Teardown runs via EXIT trap."
ok "═══ STAGING $MODE-SAAS E2E PASSED ═══"
+4 -12
View File
@@ -2,15 +2,10 @@ package main
import "testing"
// TestResolveBindHost pins the precedence: BIND_ADDR explicit > local-dev
// loopback default of 127.0.0.1 > production-shape empty (all interfaces).
// TestResolveBindHost pins the precedence: BIND_ADDR explicit > dev-mode
// fail-open default of 127.0.0.1 > production-shape empty (all interfaces).
//
// (harden/no-fail-open-auth) The loopback default is now keyed on
// MOLECULE_ENV alone (IsLocalDevEnv), decoupled from ADMIN_TOKEN — a dev box
// defaults to loopback even when it provisions an ADMIN_TOKEN. This is
// defense-in-depth, not an auth lever; auth is fail-closed in every env.
//
// Mutation-test invariant: removing the IsLocalDevEnv() branch makes
// Mutation-test invariant: removing the IsDevModeFailOpen() branch makes
// "no_bindaddr_devmode_unset_admin" fail (returns "" instead of "127.0.0.1").
// Removing the BIND_ADDR branch makes "explicit_bindaddr_*" cases fail.
func TestResolveBindHost(t *testing.T) {
@@ -40,10 +35,7 @@ func TestResolveBindHost(t *testing.T) {
bindAddr: "",
adminToken: "secret",
molEnv: "dev",
// harden/no-fail-open-auth: loopback default is keyed on
// MOLECULE_ENV alone now — a dev box defaults to loopback even
// with ADMIN_TOKEN provisioned (which dev-start.sh now does).
want: "127.0.0.1",
want: "", // ADMIN_TOKEN flips IsDevModeFailOpen to false → all interfaces
},
{
name: "no_bindaddr_production_env",
+15 -15
View File
@@ -474,12 +474,12 @@ func main() {
// HTTP server with graceful shutdown.
//
// Bind host: in local dev (MOLECULE_ENV=dev|development) default the
// listener to loopback as defense-in-depth — a dev box shouldn't be
// reachable from the LAN. This is NOT an auth lever (auth is fail-closed
// in every env now); it's strictly the safer default. Operators who need
// LAN exposure set BIND_ADDR=0.0.0.0 explicitly. Production binds all
// interfaces (existing shape). See molecule-core#7.
// Bind host: in dev-mode (no ADMIN_TOKEN, MOLECULE_ENV=dev|development)
// the AdminAuth chain fails open by design; pairing that with a wildcard
// bind would expose unauth /workspaces to any same-LAN peer. Default to
// loopback when fail-open is active. Operators who need LAN exposure set
// BIND_ADDR=0.0.0.0 explicitly. Production (ADMIN_TOKEN set) is unchanged.
// See molecule-core#7.
bindHost := resolveBindHost()
srv := &http.Server{
Addr: fmt.Sprintf("%s:%s", bindHost, port),
@@ -489,7 +489,7 @@ func main() {
// Start server in goroutine
go func() {
log.Printf("Platform starting on %s:%s (local-dev-env=%v)", bindHost, port, middleware.IsLocalDevEnv())
log.Printf("Platform starting on %s:%s (dev-mode-fail-open=%v)", bindHost, port, middleware.IsDevModeFailOpen())
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Fatalf("Server failed: %v", err)
}
@@ -528,20 +528,20 @@ func envOr(key, fallback string) string {
//
// Precedence:
// 1. BIND_ADDR — explicit operator override (any value, including "0.0.0.0").
// 2. local dev (MOLECULE_ENV=dev|development) → "127.0.0.1" (loopback only).
// 2. dev-mode fail-open active → "127.0.0.1" (loopback only).
// 3. otherwise → "" (Go binds every interface; existing prod/self-host shape).
//
// NOTE (harden/no-fail-open-auth): this is a defense-in-depth default, NOT an
// auth lever. Auth is fail-closed in every environment now, so the loopback
// default no longer compensates for a weak auth chain — it simply keeps a dev
// box off the LAN by default. It is keyed on MOLECULE_ENV alone (decoupled
// from ADMIN_TOKEN), because dev now provisions an ADMIN_TOKEN yet should
// still default to loopback. See molecule-core#7 for the original LAN finding.
// Coupling the loopback default to middleware.IsDevModeFailOpen() means the
// two safety levers — bind narrowness and auth strength — move together. A
// production deploy (ADMIN_TOKEN set) keeps binding to all interfaces because
// the auth chain is doing its job; a dev Mac (no ADMIN_TOKEN, MOLECULE_ENV=dev)
// is reachable only via loopback because the auth chain is fail-open. See
// molecule-core#7 for the original LAN exposure finding.
func resolveBindHost() string {
if v := os.Getenv("BIND_ADDR"); v != "" {
return v
}
if middleware.IsLocalDevEnv() {
if middleware.IsDevModeFailOpen() {
return "127.0.0.1"
}
return ""
@@ -1,141 +0,0 @@
package handlers
// a2a_outbound_envelope_test.go — outbound A2A `message/send` envelope
// CONTRACT gate (issue #2251).
//
// #2251: an outbound A2A envelope shipped without `role` and with text
// parts keyed `type` instead of the v0.3-canonical `kind`. The receiver's
// a-2-a-sdk v0.3 Pydantic validator silently rejected the message
// post-dispatch — the sender saw a happy 200/202 while the brief was
// dropped (the same invisible-rejection failure class as the v0.2→v0.3
// content bug pinned by a2a_corpus_test.go, but on the SEND side).
//
// The inbound corpus replay (a2a_corpus_test.go) proves normalizeA2APayload
// produces `parts[].kind` + a non-empty messageId, but it does NOT assert
// `role`, and it only covers what we RECEIVE. Nothing pins what core
// EMITS. This file pins the emit contract at the helper that builds the
// parts (buildA2AMessageParts, used by both delegate_task and
// delegate_task_async) and asserts the canonical Part key is `kind`.
//
// Part-object schema (A2A v0.3): every Part MUST carry a `kind`
// discriminator ("text" | "file" | "data"); there is NO `type` key. A
// text Part is {"kind":"text","text":"..."}. Emitting `type` makes the
// v0.3 validator drop the Part.
import (
"encoding/json"
"testing"
)
// TestBuildA2AMessageParts_TextPartUsesKindNotType pins the v0.3 Part
// discriminator for the text part emitted on every outbound A2A
// delegation. RED before #2251's fix (the helper emitted
// {"type":"text",...}); the receiver's v0.3 Pydantic validator drops a
// Part keyed `type`, silently losing the task text.
func TestBuildA2AMessageParts_TextPartUsesKindNotType(t *testing.T) {
parts := buildA2AMessageParts("do the work", nil)
if len(parts) == 0 {
t.Fatal("buildA2AMessageParts returned no parts for a non-empty task")
}
text := parts[0]
if _, hasType := text["type"]; hasType {
t.Errorf("text part uses forbidden v0.2 key `type` %v — A2A v0.3 Parts discriminate on `kind`; `type` is dropped by the receiver's validator (#2251)", text)
}
kind, ok := text["kind"].(string)
if !ok {
t.Fatalf("text part missing string `kind` discriminator; got %v", text)
}
if kind != "text" {
t.Errorf("text part kind = %q, want \"text\"", kind)
}
if text["text"] != "do the work" {
t.Errorf("text part text = %v, want \"do the work\"", text["text"])
}
}
// TestBuildA2AMessageParts_FilePartUsesKind guards the file-attachment
// Part the same way. The file path was already correct (it used `kind`),
// so this is a non-regression pin — it must STAY `kind` when the text
// path is fixed (a careless "make them consistent" edit could flip both
// to the wrong key).
func TestBuildA2AMessageParts_FilePartUsesKind(t *testing.T) {
atts := []AgentMessageAttachment{
{URI: "https://example.com/a.png", MimeType: "image/png", Name: "a.png"},
}
parts := buildA2AMessageParts("caption", atts)
if len(parts) < 2 {
t.Fatalf("expected text + file parts, got %d", len(parts))
}
file := parts[1]
if _, hasType := file["type"]; hasType {
t.Errorf("file part uses forbidden `type` key: %v", file)
}
if _, hasKind := file["kind"]; !hasKind {
t.Errorf("file part missing `kind` discriminator: %v", file)
}
}
// TestDelegationOutboundEnvelope_RoleAndKind pins the FULL outbound
// envelope contract — role + parts[].kind — on the canonical helper.
// A v0.3 `message` MUST carry `role` ("user" for a delegation request)
// and `parts` whose every entry discriminates on `kind`. This is the
// shape the receiver's MessageSendParams validator accepts; an envelope
// missing `role` or keyed `type` is silently rejected (#2251).
//
// Built from the same primitives delegation.go / mcp_tools.go assemble
// (role:"user" + buildA2AMessageParts) so the round-trip through
// json.Marshal proves the wire bytes are v0.3-valid.
func TestDelegationOutboundEnvelope_RoleAndKind(t *testing.T) {
envelope := map[string]interface{}{
"method": "message/send",
"params": map[string]interface{}{
"message": map[string]interface{}{
"role": "user",
"messageId": "deleg-1",
"parts": buildA2AMessageParts("do the work", nil),
},
},
}
raw, err := json.Marshal(envelope)
if err != nil {
t.Fatalf("marshal envelope: %v", err)
}
var parsed map[string]interface{}
if err := json.Unmarshal(raw, &parsed); err != nil {
t.Fatalf("unmarshal envelope: %v", err)
}
params, _ := parsed["params"].(map[string]interface{})
if params == nil {
t.Fatal("envelope missing params")
}
msg, _ := params["message"].(map[string]interface{})
if msg == nil {
t.Fatal("envelope missing params.message")
}
// role is mandatory on a v0.3 message — the receiver rejects without it.
role, hasRole := msg["role"].(string)
if !hasRole || role == "" {
t.Errorf("params.message missing non-empty `role` — v0.3 requires it; omitting it is the other half of #2251")
}
parts, _ := msg["parts"].([]interface{})
if len(parts) == 0 {
t.Fatal("params.message.parts is empty")
}
for i, p := range parts {
pm, _ := p.(map[string]interface{})
if pm == nil {
t.Errorf("part %d is not an object: %v", i, p)
continue
}
if _, hasType := pm["type"]; hasType {
t.Errorf("part %d uses forbidden `type` key (must be `kind`): %v", i, pm)
}
if _, hasKind := pm["kind"]; !hasKind {
t.Errorf("part %d missing `kind` discriminator: %v", i, pm)
}
}
}
@@ -801,18 +801,6 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
if _, hasID := msg["messageId"]; !hasID {
msg["messageId"] = uuid.New().String()
}
// #2251: default params.message.role to "user" when absent.
// The downstream a2a-sdk v0.3 Pydantic validator marks role a
// REQUIRED field; a role-less envelope fails parse with
// "params.message.role Field required". The Go builders
// (mcp_tools/delegation/scheduler/channels) already set it, but
// raw external/canvas POSTs to ProxyA2A may omit it — making this
// the single canonical choke that guarantees a schema-valid role.
// Mirror the messageId default exactly: inject only when missing,
// never overwrite a caller-supplied role (e.g. "agent").
if _, hasRole := msg["role"]; !hasRole {
msg["role"] = "user"
}
_, hasParts := msg["parts"]
rawContent, hasContent := msg["content"]
if !hasParts {
@@ -844,27 +832,6 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
}
}
}
// #2251: wire hygiene — the A2A v0.3 Part discriminator is
// "kind", but some builders/clients emit the legacy "type" key
// (e.g. delegation.go). The v0.3 Pydantic validator keys on
// "kind"; a stray "type" leaves the Part untagged. Rename
// "type" → "kind" on every Part that lacks an explicit "kind"
// so the discriminator is always present on the wire.
if parts, ok := msg["parts"].([]interface{}); ok {
for _, p := range parts {
part, ok := p.(map[string]interface{})
if !ok {
continue
}
if _, hasKind := part["kind"]; hasKind {
continue
}
if t, hasType := part["type"]; hasType {
part["kind"] = t
delete(part, "type")
}
}
}
}
}
@@ -1514,142 +1514,6 @@ func TestNormalizeA2APayload_NoMessageNoCheck(t *testing.T) {
}
}
// --- #2251: role default + part-kind hygiene contract tests ---
//
// These assert normalizeA2APayload is the single canonical Go choke that
// guarantees a schema-valid outbound message/send envelope: it injects a
// default params.message.role="user" when the sender omitted role (the bug
// that made delegate_task fail the peer's a2a Pydantic validator with
// "params.message.role Field required" while reply_to_workspace worked), and
// it renames the legacy Part discriminator "type"→"kind" for wire hygiene.
// normMsg is a small helper that runs normalizeA2APayload and returns the
// resolved params.message map, failing the test on any normalization error.
func normMsg(t *testing.T, raw string) map[string]interface{} {
t.Helper()
out, _, perr := normalizeA2APayload([]byte(raw))
if perr != nil {
t.Fatalf("normalizeA2APayload returned error: %+v", perr)
}
var parsed map[string]interface{}
if err := json.Unmarshal(out, &parsed); err != nil {
t.Fatalf("output not valid JSON: %v", err)
}
params, ok := parsed["params"].(map[string]interface{})
if !ok {
t.Fatalf("output missing params object: %s", string(out))
}
msg, ok := params["message"].(map[string]interface{})
if !ok {
t.Fatalf("output missing params.message object: %s", string(out))
}
return msg
}
func TestNormalizeA2APayload_DefaultsRoleWhenMissing(t *testing.T) {
cases := []struct {
name string
raw string
}{
{
name: "v0.3 parts, no role",
raw: `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
},
{
name: "v0.2 string content, no role",
raw: `{"method":"message/send","params":{"message":{"content":"hi"}}}`,
},
{
name: "legacy type part, no role",
raw: `{"method":"message/send","params":{"message":{"parts":[{"type":"text","text":"hi"}]}}}`,
},
{
name: "already wrapped jsonrpc, no role",
raw: `{"jsonrpc":"2.0","id":"x","method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
msg := normMsg(t, tc.raw)
if msg["role"] != "user" {
t.Errorf("expected role defaulted to \"user\", got %v", msg["role"])
}
// Parts must remain valid (non-empty) after normalization.
parts, ok := msg["parts"].([]interface{})
if !ok || len(parts) == 0 {
t.Fatalf("expected non-empty parts after normalization, got %v", msg["parts"])
}
// Every part must carry the v0.3 "kind" discriminator.
for i, p := range parts {
part, ok := p.(map[string]interface{})
if !ok {
t.Fatalf("part %d is not an object: %v", i, p)
}
if _, hasKind := part["kind"]; !hasKind {
t.Errorf("part %d missing \"kind\" discriminator: %v", i, part)
}
if _, hasType := part["type"]; hasType {
t.Errorf("part %d still has legacy \"type\" key: %v", i, part)
}
}
})
}
}
func TestNormalizeA2APayload_PreservesExplicitRole(t *testing.T) {
// A caller-supplied role (e.g. "agent") must NOT be overwritten with "user".
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"agent","parts":[{"kind":"text","text":"hi"}]}}}`)
if msg["role"] != "agent" {
t.Errorf("explicit role overwritten: expected \"agent\", got %v", msg["role"])
}
}
func TestNormalizeA2APayload_RenamesPartTypeToKind(t *testing.T) {
// Mirrors delegation.go's builder which emits {"type":"text",...}. After
// normalization the wire Part must be discriminated by "kind".
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"type":"text","text":"a"},{"type":"file","uri":"workspace:/x"}]}}}`)
parts := msg["parts"].([]interface{})
if len(parts) != 2 {
t.Fatalf("expected 2 parts, got %d", len(parts))
}
wantKind := []string{"text", "file"}
for i, p := range parts {
part := p.(map[string]interface{})
if part["kind"] != wantKind[i] {
t.Errorf("part %d: expected kind=%q, got %v", i, wantKind[i], part["kind"])
}
if _, hasType := part["type"]; hasType {
t.Errorf("part %d still carries legacy \"type\": %v", i, part)
}
}
}
func TestNormalizeA2APayload_DoesNotClobberKindWithType(t *testing.T) {
// If a part has BOTH kind and type, kind wins and is left untouched.
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"kind":"text","type":"ignored","text":"a"}]}}}`)
part := msg["parts"].([]interface{})[0].(map[string]interface{})
if part["kind"] != "text" {
t.Errorf("expected kind preserved as \"text\", got %v", part["kind"])
}
}
// TestNormalizeA2APayload_RoleDefault_ContractRegression documents the
// pre-fix failure: without the role default, a role-less message/send body
// emerged from normalization still missing params.message.role, which the
// peer's a2a Pydantic validator rejects. This asserts the POST-fix invariant
// (role present) directly; before the a2a_proxy.go change this assertion
// fails (role is absent → msg["role"] == nil).
func TestNormalizeA2APayload_RoleDefault_ContractRegression(t *testing.T) {
msg := normMsg(t, `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"delegate this"}]}}}`)
role, hasRole := msg["role"]
if !hasRole {
t.Fatal("REGRESSION (#2251): params.message.role absent after normalization — peer a2a validator will reject with 'role Field required'")
}
if role != "user" {
t.Errorf("expected default role \"user\", got %v", role)
}
}
// --- resolveAgentURL direct unit tests ---
func TestResolveAgentURL_CacheHit(t *testing.T) {
@@ -68,10 +68,6 @@ func TestPeers_CrossTenant_OrgRootNotLeaked(t *testing.T) {
caller := "org-a-root" // parent_id IS NULL — an org root for tenant A
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
// (Unordered match is set above, so this can be consumed at any point.)
seedDiscoveryGrandfather(mock, caller)
// parent_id lookup → NULL (caller is an org root)
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs(caller).
@@ -132,9 +128,6 @@ func TestPeers_SameOrg_SiblingsStillWork(t *testing.T) {
caller := "org-a-child-1"
parent := "org-a-root"
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
seedDiscoveryGrandfather(mock, caller)
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs(caller).
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(parent))
@@ -179,11 +179,8 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
"message": map[string]interface{}{
"role": "user",
"messageId": delegationID,
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251) —
// a `type`-keyed Part is dropped by the receiver's v0.3
// validator, silently losing the delegated task.
"parts": []map[string]interface{}{{"kind": "text", "text": body.Task}},
"metadata": map[string]interface{}{"delegation_id": delegationID},
"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
"metadata": map[string]interface{}{"delegation_id": delegationID},
},
},
})
+15 -20
View File
@@ -422,33 +422,28 @@ func (h *DiscoveryHandler) CheckAccess(c *gin.Context) {
// workspaces with tokens must present a matching Bearer, token binding
// is strict (A's token cannot authenticate caller B).
//
// (harden/no-fail-open-auth) Fails CLOSED on DB error. This used to return nil
// (allow) on a HasAnyLiveToken hiccup "because discovery only exposes peer URLs
// already behind CanCommunicate" — but the CTO "nothing fail-open" directive is
// absolute, and a request must never gain access because the auth datastore is
// unreachable. A datastore error now writes 503 (availability tradeoff that
// grants NO access) and returns a non-nil error; the caller already does
// `if err != nil { return }` so the 503 body is what the client sees.
// Fail-open on DB hiccups. Unlike secrets.Values (which returns plaintext
// secrets and must fail closed), discovery only exposes peer URLs that
// are already behind the existing `CanCommunicate` hierarchy check — a
// momentary DB outage shouldn't take agent-to-agent discovery offline.
func validateDiscoveryCaller(ctx context.Context, c *gin.Context, workspaceID string) error {
hasLive, err := wsauth.HasAnyLiveToken(ctx, db.DB, workspaceID)
if err != nil {
log.Printf("wsauth: discovery HasAnyLiveToken(%s): datastore lookup failed (returning 503): %v", workspaceID, err)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "platform datastore unavailable — retry shortly",
"code": "platform_unavailable",
})
return errors.New("auth datastore unavailable")
log.Printf("wsauth: discovery HasAnyLiveToken(%s) failed: %v — allowing request", workspaceID, err)
return nil
}
if !hasLive {
return nil // legacy / pre-upgrade
}
// (harden/no-fail-open-auth) The former dev-mode escape hatch that
// returned nil (allow) here when MOLECULE_ENV=dev + ADMIN_TOKEN unset
// has been REMOVED. Discovery callers must present a verified CP
// session or a valid bearer in every environment. Local dev now
// authenticates the Canvas with a provisioned ADMIN_TOKEN /
// NEXT_PUBLIC_ADMIN_TOKEN (see scripts/dev-start.sh), so the Details
// tab loads peers with a real credential rather than via fail-open.
// Tier-1b dev-mode hatch — same escape hatch AdminAuth and
// WorkspaceAuth apply on a local Docker setup. Without this, the
// canvas Details tab can never load peers for a workspace that has
// registered its live token, producing the 401 the user sees.
// Gated by MOLECULE_ENV=development + empty ADMIN_TOKEN, so SaaS
// production stays strict.
if middleware.IsDevModeFailOpen() {
return nil
}
// Try session cookie auth first (SaaS canvas path).
// verifiedCPSession returns (valid, presented):
@@ -49,10 +49,6 @@ func TestDiscover_WorkspaceNotFound_WithCaller(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first;
// grandfather (count=0) so the bearer-less request is allowed through.
seedDiscoveryGrandfather(mock, "ws-caller")
// CanCommunicate will need DB lookups — both workspace name lookups
// For the access check: caller lookup succeeds, target lookup fails
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
@@ -117,9 +113,6 @@ func TestPeers_WithParent(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-sibling-1")
// Expect parent_id lookup for the requesting workspace
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-sibling-1").
@@ -172,9 +165,6 @@ func TestPeers_NotFound(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-ghost")
// Workspace not found
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-ghost").
@@ -201,11 +191,6 @@ func TestPeers_DBError(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// Auth probe grandfathers; this test targets a DB error on the
// *handler-body* parent_id query → 500 (distinct from the auth-probe
// DB error which now fails closed with 503).
seedDiscoveryGrandfather(mock, "ws-dberr")
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-dberr").
WillReturnError(sql.ErrConnDone)
@@ -231,9 +216,6 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-root-alone")
// Root workspace (parent_id is NULL)
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-root-alone").
@@ -288,9 +270,6 @@ func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) {
mock := setupTestDB(t)
setupTestRedis(t)
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-self")
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-self").
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-pm"))
@@ -948,14 +927,13 @@ func TestDiscoverHostPeer_Smoke_Success(t *testing.T) {
}
}
// ==================== Peers auth — fail-CLOSED gate ====================
// ==================== Peers auth — dev-mode fail-open gate ====================
//
// (harden/no-fail-open-auth) validateDiscoveryCaller USED to apply a
// Tier-1b dev-mode hatch that let the bearer-less canvas session load the
// Details → PEERS list when MOLECULE_ENV=development AND ADMIN_TOKEN empty.
// That hatch has been REMOVED — discovery callers must present a verified
// CP session or a valid bearer in every environment. These tests pin the
// fail-closed contract against accidental re-introduction.
// validateDiscoveryCaller applies a Tier-1b dev-mode hatch so the canvas
// user session (which holds no workspace-scoped bearer) can still load
// the Details → PEERS list on a local Docker setup. The gate must pass
// ONLY when MOLECULE_ENV is development AND ADMIN_TOKEN is empty.
// These tests pin that contract against accidental polarity flips.
// peersAuthFixtureHasLiveToken seeds the mock rows required for the
// Peers handler to reach the auth branch: HasAnyLiveToken → true (a
@@ -968,30 +946,10 @@ func peersAuthFixtureHasLiveToken(mock sqlmock.Sqlmock, workspaceID string) {
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
}
// seedDiscoveryGrandfather seeds the FIRST query validateDiscoveryCaller
// issues (HasAnyLiveToken → 0 = legacy / pre-upgrade) so a bearer-less
// discovery request grandfathers through and the test can exercise the
// handler body.
//
// (harden/no-fail-open-auth) Before this branch, validateDiscoveryCaller
// returned nil (allow) when the HasAnyLiveToken probe ERRORED — so these
// handler-body tests never had to seed the probe at all; the unmatched
// COUNT query erred and the fail-open swallowed it. Now that the DB-error
// path fails CLOSED (503), the probe must be seeded explicitly. count=0 is
// the legitimate grandfather path (no live tokens for this workspace yet),
// which is what these pre-existing tests intend.
func seedDiscoveryGrandfather(mock sqlmock.Sqlmock, workspaceID string) {
mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens").
WithArgs(workspaceID).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
}
func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) {
// (harden/no-fail-open-auth) Exact old-hatch conditions:
// MOLECULE_ENV=development AND ADMIN_TOKEN empty, with a live token in
// the DB. The bearer-less canvas-style request must now 401 — the
// dev-mode hatch that returned nil (allow) here is gone. Local dev
// authenticates via a provisioned ADMIN_TOKEN (scripts/dev-start.sh).
func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) {
// Dev mode: MOLECULE_ENV=development AND ADMIN_TOKEN empty. Canvas
// sends no bearer token; validateDiscoveryCaller must return nil
// (allow) and the handler must proceed to return the peer list.
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "")
@@ -999,10 +957,22 @@ func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// Only the HasAnyLiveToken probe runs; auth 401s before the peer
// queries, so no further expectations are seeded.
peersAuthFixtureHasLiveToken(mock, "ws-dev")
// Root workspace → children+parent queries still fire but the
// parent_id lookup comes first.
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-dev").
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
peerCols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id IS NULL AND w.id").
WithArgs("ws-dev").
WillReturnRows(sqlmock.NewRows(peerCols))
// #383 — children query gained explicit `w.id != $2` self-filter.
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs("ws-dev", "ws-dev").
WillReturnRows(sqlmock.NewRows(peerCols))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-dev"}}
@@ -1010,8 +980,8 @@ func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) {
handler.Peers(c)
if w.Code != http.StatusUnauthorized {
t.Fatalf("expected 401 (fail-closed) under old dev-mode hatch conditions, got %d: %s", w.Code, w.Body.String())
if w.Code != http.StatusOK {
t.Fatalf("expected 200 under dev-mode hatch, got %d: %s", w.Code, w.Body.String())
}
}
@@ -1064,70 +1034,6 @@ func TestPeers_DevModeFailOpen_ClosedInProduction(t *testing.T) {
}
}
// TestPeers_AuthProbeDBError_FailsClosed pins the removal of
// validateDiscoveryCaller's fail-open-on-DB-error branch
// (harden/no-fail-open-auth). When the HasAnyLiveToken auth probe ERRORS, the
// request must NOT be allowed through — it now returns 503 (availability
// tradeoff that grants NO access). Before this branch the function returned nil
// (allow) on a DB hiccup, so the request reached the peer queries.
//
// Watch-it-fail: restore `if err != nil { log; return nil }` in
// validateDiscoveryCaller → this flips 503→(200/handler path) and fails.
func TestPeers_AuthProbeDBError_FailsClosed(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewDiscoveryHandler()
// The FIRST query validateDiscoveryCaller issues (HasAnyLiveToken) errors.
// No further expectations: a fail-closed 503 must be written before the
// peer-list queries run.
mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens").
WithArgs("ws-dberr-auth").
WillReturnError(sql.ErrConnDone)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-dberr-auth"}}
c.Request = httptest.NewRequest("GET", "/registry/ws-dberr-auth/peers", nil)
handler.Peers(c)
if w.Code != http.StatusServiceUnavailable {
t.Fatalf("auth-probe DB error must fail CLOSED: expected 503, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestDiscover_AuthProbeDBError_FailsClosed is the Discover-endpoint companion
// to TestPeers_AuthProbeDBError_FailsClosed: a HasAnyLiveToken error on the
// caller's discovery request fails CLOSED with 503 (was: fail-open allow).
func TestDiscover_AuthProbeDBError_FailsClosed(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewDiscoveryHandler()
mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens").
WithArgs("ws-caller").
WillReturnError(sql.ErrConnDone)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-target"}}
c.Request = httptest.NewRequest("GET", "/registry/discover/ws-target", nil)
c.Request.Header.Set("X-Workspace-ID", "ws-caller")
handler.Discover(c)
if w.Code != http.StatusServiceUnavailable {
t.Fatalf("Discover auth-probe DB error must fail CLOSED: expected 503, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// ==================== Peers — #383 self never appears in result ====================
// TestPeers_ExcludeSelf_DefenseInDepth verifies the final-line filter in
@@ -1150,9 +1056,6 @@ func TestPeers_ExcludeSelf_DefenseInDepth(t *testing.T) {
const selfID = "ws-xiaodong"
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
seedDiscoveryGrandfather(mock, selfID)
// parent_id lookup — workspace has a parent.
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs(selfID).
@@ -551,9 +551,6 @@ func TestDiscover_AccessDenied(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-child-a")
// CanCommunicate: different parents → denied
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
WithArgs("ws-child-a").
@@ -585,9 +582,6 @@ func TestDiscover_TargetOffline(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-caller")
// Share a parent so communication is allowed under post-#1955 rules
sharedParent := "ws-parent"
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
@@ -373,9 +373,6 @@ func TestExtended_DiscoverWithCallerID(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-caller")
// CanCommunicate needs to look up both workspaces
// Share a parent so communication is allowed under post-#1955 rules
sharedParent := "ws-parent"
@@ -467,9 +464,6 @@ func TestExtended_Peers(t *testing.T) {
setupTestRedis(t)
handler := NewDiscoveryHandler()
// validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather.
seedDiscoveryGrandfather(mock, "ws-peer")
// Expect parent_id lookup for requesting workspace (root-level, no parent)
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-peer").
@@ -192,11 +192,7 @@ func (h *MCPHandler) toolGetWorkspaceInfo(ctx context.Context, workspaceID strin
// follow in the order provided, with kind derived from MIME type.
func buildA2AMessageParts(task string, attachments []AgentMessageAttachment) []map[string]interface{} {
parts := []map[string]interface{}{
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251).
// The receiver's v0.3 Pydantic validator drops a Part keyed
// `type`, silently losing the task text — the file part below
// already uses `kind`, this is the matching fix for text.
{"kind": "text", "text": task},
{"type": "text", "text": task},
}
for _, att := range attachments {
kind := kindFromMimeType(att.MimeType)
@@ -161,7 +161,7 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
// 1. Strip plugin's rule/fragment markers from CLAUDE.md (mirrors
// AgentskillsAdaptor.uninstall lines 184-188). Best-effort: if
// the user edited CLAUDE.md, our marker stays untouched.
h.stripPluginMarkersFromMemory(ctx, containerName, pluginName)
h.stripPluginMarkersFromMemory(ctx, workspaceID, containerName, pluginName)
// 2. Remove copied skill dirs declared in the plugin's plugin.yaml.
for _, skill := range skillNames {
@@ -171,9 +171,11 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
log.Printf("Plugin uninstall: skipping invalid skill name %q in %s: %v", skill, pluginName, err)
continue
}
_, _ = h.execAsRoot(ctx, containerName, []string{
if _, rmErr := h.execAsRoot(ctx, containerName, []string{
"rm", "-rf", "/configs/skills/" + skill,
})
}); rmErr != nil {
log.Printf("Plugin uninstall: failed to remove skill %s from %s: %v", skill, workspaceID, rmErr)
}
}
// 3. Delete the plugin directory itself (as root to handle file ownership).
@@ -393,7 +393,7 @@ func (h *PluginsHandler) readPluginSkillsFromContainer(ctx context.Context, cont
// `# Plugin: <name> /` — mirrors AgentskillsAdaptor.uninstall's stripping
// logic so install/uninstall are symmetric. Best-effort: silent on read or
// write failure, since the rest of uninstall must still succeed.
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, containerName, pluginName string) {
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, workspaceID, containerName, pluginName string) {
// Use sed via bash -c for atomic in-place delete: drop the marker line
// and the blank line that follows it (install adds a leading blank line
// before the marker via append_to_memory). Three sed passes mirror the
@@ -417,7 +417,9 @@ func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, conta
`awk 'BEGIN{skip=0; blanks=0} /^%s/{skip=1; blanks=0; next} skip==1 && /^[[:space:]]*$/{blanks++; if(blanks>=2){skip=0; print; next} next} /^# Plugin: /{if(skip==1)skip=0} skip==1{next} {print}' /configs/CLAUDE.md > /tmp/claude.new && mv /tmp/claude.new /configs/CLAUDE.md`,
regexpEscapeForAwk(marker),
)
_, _ = h.execAsRoot(ctx, containerName, []string{"bash", "-c", script})
if _, awkErr := h.execAsRoot(ctx, containerName, []string{"bash", "-c", script}); awkErr != nil {
log.Printf("Plugin uninstall: failed to strip markers from CLAUDE.md for %s in %s: %v", pluginName, workspaceID, awkErr)
}
}
// regexpEscapeForAwk escapes characters that have special meaning inside an
@@ -89,16 +89,13 @@ func TestSecurity_GetTemplates_NoAuth_Returns401(t *testing.T) {
}
}
// TestSecurity_GetTemplates_FreshInstall_FailsClosed pins the post-hardening
// contract (harden/no-fail-open-auth): GET /templates on a fresh install (zero
// enrolled workspaces, no ADMIN_TOKEN) now 401s with no bearer. The former
// AdminAuth Tier-1 lazy-bootstrap fail-open (fresh install ⇒ 200) is gone — a
// new deployment must provision ADMIN_TOKEN (dev does so via dev-start.sh).
func TestSecurity_GetTemplates_FreshInstall_FailsClosed(t *testing.T) {
// TestSecurity_GetTemplates_FreshInstall_FailsOpen verifies that GET /templates
// still succeeds on a fresh install (zero enrolled workspaces → AdminAuth fail-open).
// This is the regression check: the auth gate must not break new deployments.
func TestSecurity_GetTemplates_FreshInstall_FailsOpen(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
t.Setenv("ADMIN_TOKEN", "")
t.Setenv("MOLECULE_ENV", "")
authDB, authMock := newFreshInstallAuthDB(t)
tmpDir := t.TempDir()
@@ -111,8 +108,8 @@ func TestSecurity_GetTemplates_FreshInstall_FailsClosed(t *testing.T) {
req, _ := http.NewRequest(http.MethodGet, "/templates", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("#686 GET /templates fresh-install fail-closed: want 401, got %d body=%s", w.Code, w.Body.String())
if w.Code != http.StatusOK {
t.Errorf("#686 GET /templates fresh-install: want 200 (fail-open), got %d body=%s", w.Code, w.Body.String())
}
if err := authMock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet auth mock expectations: %v", err)
@@ -151,14 +148,12 @@ func TestSecurity_GetOrgTemplates_NoAuth_Returns401(t *testing.T) {
}
}
// TestSecurity_GetOrgTemplates_FreshInstall_FailsClosed mirrors the /templates
// fail-closed check for /org/templates (harden/no-fail-open-auth): a fresh
// install with no bearer / no ADMIN_TOKEN now 401s rather than fail-open.
func TestSecurity_GetOrgTemplates_FreshInstall_FailsClosed(t *testing.T) {
// TestSecurity_GetOrgTemplates_FreshInstall_FailsOpen mirrors the /templates
// regression check for /org/templates — fresh installs must still work.
func TestSecurity_GetOrgTemplates_FreshInstall_FailsOpen(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
t.Setenv("ADMIN_TOKEN", "")
t.Setenv("MOLECULE_ENV", "")
authDB, authMock := newFreshInstallAuthDB(t)
tmpDir := t.TempDir()
@@ -172,8 +167,8 @@ func TestSecurity_GetOrgTemplates_FreshInstall_FailsClosed(t *testing.T) {
req, _ := http.NewRequest(http.MethodGet, "/org/templates", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("#686 GET /org/templates fresh-install fail-closed: want 401, got %d body=%s", w.Code, w.Body.String())
if w.Code != http.StatusOK {
t.Errorf("#686 GET /org/templates fresh-install: want 200 (fail-open), got %d body=%s", w.Code, w.Body.String())
}
if err := authMock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet auth mock expectations: %v", err)
+42 -34
View File
@@ -5,53 +5,61 @@ import (
"strings"
)
// Local-dev environment detection.
// Dev-mode escape hatch — factored out of AdminAuth + WorkspaceAuth so a
// future third caller (or a change to what "dev mode" means) touches one
// place. Narrowing the exposed seam also makes it grep-able from security
// reviews: every `isDevModeFailOpen()` call is an intentional fail-open.
//
// SECURITY (harden/no-fail-open-auth): this file used to export an auth
// escape hatch — `isDevModeFailOpen()` — that let AdminAuth, WorkspaceAuth,
// and the discovery handler serve admin/workspace-protected endpoints with
// NO bearer token whenever `ADMIN_TOKEN` was unset and `MOLECULE_ENV` was a
// dev value. The CTO directive is "nothing should be fail-open": auth is now
// fail-CLOSED in every environment, dev included. The hatch is GONE.
// Why the helper exists at all: on `go run ./cmd/server` the Canvas (at
// localhost:3000) calls the platform (at localhost:8080) cross-port. Both
// `isSameOriginCanvas` (Referer==Host) and the AdminAuth Tier-1 fail-open
// (no tokens in DB) close the moment the user creates their first
// workspace. Without this hatch the Canvas 401s on every /workspaces
// enumeration and every /workspaces/:id/* read until the operator sets
// `ADMIN_TOKEN` and rebuilds the Canvas bundle with a matching
// `NEXT_PUBLIC_ADMIN_TOKEN`. That's too much friction for a local smoke
// test — hence the hatch.
//
// What remains here is a NON-security predicate, `isLocalDevEnv()`, that
// reports ONLY whether `MOLECULE_ENV` names a local-dev environment. It does
// NOT consult `ADMIN_TOKEN` and it does NOT influence authentication. It is
// used for two convenience/defense-in-depth knobs that never grant access:
//
// - ratelimit.go: relax the per-caller request bucket on a single-user
// local stack (a DoS knob, not a credential — relaxing it cannot expose
// any protected data).
// - cmd/server resolveBindHost(): default the HTTP listener to loopback
// (127.0.0.1) in local dev. This is strictly *safer* than binding all
// interfaces and is unrelated to whether a request is authenticated.
//
// Local dev now stays AUTHENTICATED, not open: scripts/dev-start.sh
// provisions a deterministic `ADMIN_TOKEN` and hands the matching
// `NEXT_PUBLIC_ADMIN_TOKEN` to the Canvas, so the browser sends a real
// bearer. See scripts/dev-start.sh and canvas/src/lib/api.ts.
// Why it's safe for SaaS: hosted tenants are provisioned with both
// `ADMIN_TOKEN` (a random secret, checked by Tier-2 above) and
// `MOLECULE_ENV=production`. Either one being set makes this helper
// return false, so the fail-open branch is unreachable in production.
// Real token minting goes through AdminAuth, so local development keeps a
// narrow fail-open mode for browser/API smoke tests without an admin secret.
// devModeEnvValues is the set of MOLECULE_ENV values that count as
// "explicit local dev". Production callers don't set any of these.
// "explicit dev mode". Production callers don't set any of these.
// Case-insensitive compare via strings.ToLower below.
var devModeEnvValues = map[string]struct{}{
"development": {},
"dev": {},
}
// isLocalDevEnv reports whether MOLECULE_ENV names a local-dev environment
// ("development" / "dev"). It carries NO authentication semantics — callers
// must never use it to bypass a credential check. It exists only for
// dev-convenience / defense-in-depth knobs (rate-limit relaxation, loopback
// bind default) that cannot expose protected data.
func isLocalDevEnv() bool {
// isDevModeFailOpen reports whether the AdminAuth / WorkspaceAuth
// middleware should let a bearer-less request through despite live
// workspace tokens existing in the DB.
//
// True only when BOTH:
// - `ADMIN_TOKEN` is empty (operator has not opted in to the #684
// closure), AND
// - `MOLECULE_ENV` is explicitly a dev value ("development" / "dev").
//
// Either condition failing returns false — that's the SaaS safety
// guarantee. Tests: `devmode_test.go` covers every branch.
func isDevModeFailOpen() bool {
if os.Getenv("ADMIN_TOKEN") != "" {
return false
}
env := strings.ToLower(strings.TrimSpace(os.Getenv("MOLECULE_ENV")))
_, ok := devModeEnvValues[env]
return ok
}
// IsLocalDevEnv exposes isLocalDevEnv to packages outside the middleware
// module (cmd/server bind-host default). NON-security: see isLocalDevEnv.
func IsLocalDevEnv() bool {
return isLocalDevEnv()
// IsDevModeFailOpen exposes isDevModeFailOpen to packages outside the
// middleware module (handlers, discovery, etc.) so they can apply the
// same Tier-1b escape hatch their sibling AdminAuth / WorkspaceAuth
// already do. Keep every call site audit-tagged so security review can
// grep them.
func IsDevModeFailOpen() bool {
return isDevModeFailOpen()
}
@@ -4,66 +4,74 @@ import (
"testing"
)
// Unit tests for the isLocalDevEnv predicate.
//
// (harden/no-fail-open-auth) This predicate replaced the old
// isDevModeFailOpen() auth escape hatch. It carries NO authentication
// semantics and does NOT consult ADMIN_TOKEN — it reports ONLY whether
// MOLECULE_ENV names a local-dev environment. It gates non-security knobs
// (rate-limit relaxation, loopback bind default). The fail-CLOSED auth
// behaviour is enforced by no_fail_open_test.go.
// Unit tests for the isDevModeFailOpen predicate. The AdminAuth and
// WorkspaceAuth middleware tests exercise the same helper indirectly via
// HTTP, but a direct predicate test locks the pure-logic behaviour:
// future callers can add themselves to `devmode.go` with confidence.
func TestIsLocalDevEnv_Development_True(t *testing.T) {
func TestIsDevModeFailOpen_DevModeNoAdminToken_True(t *testing.T) {
t.Setenv("MOLECULE_ENV", "development")
if !isLocalDevEnv() {
t.Error("expected MOLECULE_ENV=development to be local dev")
t.Setenv("ADMIN_TOKEN", "")
if !isDevModeFailOpen() {
t.Error("expected dev mode + no admin token to return true")
}
}
func TestIsLocalDevEnv_ShortAlias_True(t *testing.T) {
func TestIsDevModeFailOpen_DevModeShortAlias_True(t *testing.T) {
// "dev" is a valid alias for "development".
t.Setenv("MOLECULE_ENV", "dev")
if !isLocalDevEnv() {
t.Error("expected MOLECULE_ENV=dev to be treated as local dev")
t.Setenv("ADMIN_TOKEN", "")
if !isDevModeFailOpen() {
t.Error("expected MOLECULE_ENV=dev to be treated as dev mode")
}
}
func TestIsLocalDevEnv_IgnoresAdminToken(t *testing.T) {
// Decoupled from ADMIN_TOKEN: dev now provisions one, but the bind /
// rate-limit knobs still treat the env as local dev. Crucially this
// predicate grants no access, so the coupling no longer matters.
func TestIsDevModeFailOpen_AdminTokenSet_False(t *testing.T) {
// Setting ADMIN_TOKEN is the operator's explicit opt-in to the #684
// closure. Dev mode must NOT silently override that signal.
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "operator-set-this")
if !isLocalDevEnv() {
t.Error("ADMIN_TOKEN must not affect isLocalDevEnv (env-only predicate)")
t.Setenv("ADMIN_TOKEN", "operator-explicitly-set-this")
if isDevModeFailOpen() {
t.Error("explicit ADMIN_TOKEN must suppress the dev-mode hatch")
}
}
func TestIsLocalDevEnv_Production_False(t *testing.T) {
func TestIsDevModeFailOpen_Production_False(t *testing.T) {
// The SaaS-safety guarantee: production tenants always have
// MOLECULE_ENV=production, so the hatch is unreachable even if a
// misconfigured deployment also leaves ADMIN_TOKEN unset.
t.Setenv("MOLECULE_ENV", "production")
if isLocalDevEnv() {
t.Error("production must not count as local dev")
t.Setenv("ADMIN_TOKEN", "")
if isDevModeFailOpen() {
t.Error("production must never hit the dev-mode fail-open branch")
}
}
func TestIsLocalDevEnv_CaseInsensitive(t *testing.T) {
func TestIsDevModeFailOpen_CaseInsensitive(t *testing.T) {
// Operators shouldn't have to remember exact casing for a dev-only
// convenience. "Development", "DEV", " dev " all count.
cases := []string{"Development", "DEVELOPMENT", "Dev", "DEV", " dev "}
for _, env := range cases {
t.Run(env, func(t *testing.T) {
t.Setenv("MOLECULE_ENV", env)
if !isLocalDevEnv() {
t.Errorf("MOLECULE_ENV=%q should count as local dev", env)
t.Setenv("ADMIN_TOKEN", "")
if !isDevModeFailOpen() {
t.Errorf("MOLECULE_ENV=%q should count as dev mode", env)
}
})
}
}
func TestIsLocalDevEnv_UnknownEnv_False(t *testing.T) {
func TestIsDevModeFailOpen_UnknownEnv_False(t *testing.T) {
// Arbitrary / unset MOLECULE_ENV values are NOT treated as dev mode.
// Keeps the fail-open branch narrow — no silent opt-in from a typo.
cases := []string{"", "staging", "local", "preview", "test", "devel"}
for _, env := range cases {
t.Run(env, func(t *testing.T) {
t.Setenv("MOLECULE_ENV", env)
if isLocalDevEnv() {
t.Errorf("MOLECULE_ENV=%q must not count as local dev", env)
t.Setenv("ADMIN_TOKEN", "")
if isDevModeFailOpen() {
t.Errorf("MOLECULE_ENV=%q must not enable fail-open", env)
}
})
}
@@ -1,245 +0,0 @@
package middleware
import (
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"github.com/DATA-DOG/go-sqlmock"
"github.com/gin-gonic/gin"
)
// no_fail_open_test.go is the regression gate for the CTO directive
// "nothing should be fail-open" (branch harden/no-fail-open-auth).
//
// It asserts that AdminAuth and WorkspaceAuth fail CLOSED (401) under the
// EXACT conditions that used to trigger the removed dev-mode fail-open hatch:
// - ADMIN_TOKEN unset, AND
// - MOLECULE_ENV is a dev value ("development" / "dev"), AND
// - any HasAnyLiveTokenGlobal state (0 = fresh install, 1 = post-workspace).
//
// To prove this is RED against the old behaviour: temporarily restore the
// `if isDevModeFailOpen() { c.Next(); return }` short-circuit in
// wsauth_middleware.go (and the Tier-1 `if adminSecret == "" { c.Next() }`
// branch) — every sub-case below flips from 401 to 200 and fails. After the
// hardening, all sub-cases are 401.
// failOpenConditions enumerates the (MOLECULE_ENV, hasLiveTokens) combinations
// that the removed hatch keyed on. ADMIN_TOKEN is always unset here — that was
// a precondition of the old fail-open.
var failOpenConditions = []struct {
name string
molEnv string
liveCount int
}{
{"dev_alias_fresh_install", "dev", 0},
{"dev_alias_post_workspace", "dev", 1},
{"development_fresh_install", "development", 0},
{"development_post_workspace", "development", 1},
}
func TestAdminAuth_NoFailOpen_UnderOldHatchConditions(t *testing.T) {
for _, tc := range failOpenConditions {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("ADMIN_TOKEN", "")
t.Setenv("MOLECULE_ENV", tc.molEnv)
// Ensure no CP-session path can accidentally pass.
t.Setenv("CP_UPSTREAM_URL", "")
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock.New: %v", err)
}
defer mockDB.Close()
// AdminAuth always probes HasAnyLiveTokenGlobal (for the 503-on-
// outage semantics), so it must be expected for both counts.
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(tc.liveCount))
r := gin.New()
r.GET("/admin/secrets", AdminAuth(mockDB), func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodGet, "/admin/secrets", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("AdminAuth must fail CLOSED under old hatch conditions "+
"(MOLECULE_ENV=%q, ADMIN_TOKEN unset, liveTokens=%d): expected 401, got %d: %s",
tc.molEnv, tc.liveCount, w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
})
}
}
func TestWorkspaceAuth_NoFailOpen_UnderOldHatchConditions(t *testing.T) {
for _, tc := range failOpenConditions {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("ADMIN_TOKEN", "")
t.Setenv("MOLECULE_ENV", tc.molEnv)
t.Setenv("CP_UPSTREAM_URL", "")
// WorkspaceAuth 401s before any DB lookup when there is no
// bearer / cookie, so no queries are expected regardless of
// the nominal live-token count.
mockDB, _, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock.New: %v", err)
}
defer mockDB.Close()
r := gin.New()
r.GET("/workspaces/:id/activity", WorkspaceAuth(mockDB), func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodGet,
"/workspaces/00000000-0000-0000-0000-000000000000/activity", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("WorkspaceAuth must fail CLOSED under old hatch conditions "+
"(MOLECULE_ENV=%q, ADMIN_TOKEN unset): expected 401, got %d: %s",
tc.molEnv, w.Code, w.Body.String())
}
})
}
}
// TestCanvasOrBearer_NoFailOpen_UnderOldHatchConditions is the regression gate
// for the two fail-open branches removed from CanvasOrBearer
// (harden/no-fail-open-auth, "nothing fail-open" pass 2):
//
// (a) lazy-bootstrap pass: `if !hasLive { c.Next(); return }` — a zero-token
// install used to pass EVERYTHING through. Now a bearer-less request on a
// fresh install (HasAnyLiveTokenGlobal → 0) fails CLOSED with 401.
// (b) fail-open-on-DB-error: `if err != nil { log; c.Next(); return }` — a
// HasAnyLiveTokenGlobal error used to ALLOW. Now it fails CLOSED with 503.
//
// Watch-it-fail: restore either short-circuit in CanvasOrBearer and the
// matching sub-case flips (401→200 / 503→200) and fails.
func TestCanvasOrBearer_NoFailOpen_UnderOldHatchConditions(t *testing.T) {
// (a) Fresh install (0 live tokens), no bearer, no ADMIN_TOKEN → 401.
t.Run("zero_token_install_no_bearer_fails_closed_401", func(t *testing.T) {
t.Setenv("ADMIN_TOKEN", "")
t.Setenv("CORS_ORIGINS", "")
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock.New: %v", err)
}
defer mockDB.Close()
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("CanvasOrBearer lazy-bootstrap fail-open removed: zero-token install must 401, got %d: %s",
w.Code, w.Body.String())
}
if handlerCalled {
t.Error("handler reached on a fresh-install bearer-less request — lazy-bootstrap fail-open not removed")
}
})
// (b) Auth datastore error → 503 (NOT allow).
t.Run("db_error_fails_closed_503", func(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock.New: %v", err)
}
defer mockDB.Close()
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnError(http.ErrAbortHandler) // any non-nil error suffices
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusServiceUnavailable {
t.Errorf("CanvasOrBearer DB-error fail-open removed: must 503, got %d: %s", w.Code, w.Body.String())
}
if handlerCalled {
t.Error("handler reached on a datastore-error request — DB-error fail-open not removed")
}
})
}
// TestNoFailOpenAuthHelperReexists is a source-guard: it asserts that no
// fail-open auth helper (the removed isDevModeFailOpen / IsDevModeFailOpen)
// has crept back into the middleware package as real code. The replacement
// predicate is the NON-security isLocalDevEnv (bind / rate-limit only);
// re-introducing the old fail-open identifier as a declaration or call is a
// regression of the CTO directive.
//
// It matches the *invocation/declaration* form `isDevModeFailOpen(` (which
// only appears in live code) and deliberately ignores prose mentions in
// `//` comments, so the historical references kept in doc comments don't
// trip the guard.
func TestNoFailOpenAuthHelperReexists(t *testing.T) {
forbidden := []string{"isDevModeFailOpen(", "IsDevModeFailOpen("}
entries, err := os.ReadDir(".")
if err != nil {
t.Fatalf("ReadDir: %v", err)
}
for _, e := range entries {
name := e.Name()
if e.IsDir() || !strings.HasSuffix(name, ".go") {
continue
}
// Skip this guard file itself (it names the forbidden tokens on
// purpose, including inside a comment).
if name == "no_fail_open_test.go" {
continue
}
data, err := os.ReadFile(filepath.Clean(name))
if err != nil {
t.Fatalf("ReadFile %s: %v", name, err)
}
for i, line := range strings.Split(string(data), "\n") {
// Ignore single-line comments — historical mentions live there.
code := line
if idx := strings.Index(code, "//"); idx >= 0 {
code = code[:idx]
}
for _, f := range forbidden {
if strings.Contains(code, f) {
t.Errorf("%s:%d uses forbidden fail-open auth helper %q — "+
"the dev-mode fail-open hatch must stay removed (harden/no-fail-open-auth). "+
"Use isLocalDevEnv (NON-security) for dev-only knobs instead.",
name, i+1, strings.TrimSuffix(f, "("))
}
}
}
}
}
@@ -102,16 +102,15 @@ func (rl *RateLimiter) keyFor(c *gin.Context) string {
// the priority list and rationale.
func (rl *RateLimiter) Middleware() gin.HandlerFunc {
return func(c *gin.Context) {
// Local-dev rate-limit relaxation (NON-security; see devmode.go).
// On a local single-user stack the 600-req/min bucket fills fast:
// a 15-workspace canvas + activity polling + approvals polling +
// A2A overlay + initial hydration all land in one bucket, so a
// minute of active use can trip 429 and blank the page. This only
// relaxes a DoS knob — it grants no access and is unrelated to
// authentication (auth is fail-closed in every env). Gated solely
// by MOLECULE_ENV=dev/development so SaaS production keeps the
// bucket. Decoupled from ADMIN_TOKEN (dev now provisions one).
if isLocalDevEnv() {
// Tier-1b dev-mode hatch — same gate as AdminAuth / WorkspaceAuth /
// discovery. On a local single-user Docker setup the 600-req/min
// bucket fills fast: a 15-workspace canvas + activity polling +
// approvals polling + A2A overlay + initial hydration all land in
// one bucket (whichever keyFor returns — typically the dev user's
// IP or shared admin token), so a minute of active use can trip
// 429 and blank the page. Gated by MOLECULE_ENV=development +
// empty ADMIN_TOKEN so SaaS production keeps the bucket.
if isDevModeFailOpen() {
c.Header("X-RateLimit-Limit", "unlimited")
c.Next()
return
@@ -120,12 +120,12 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc {
return
}
}
// No bearer, no verified CP session: fail CLOSED in EVERY
// environment (harden/no-fail-open-auth). The old local-dev
// escape hatch that let bearer-less requests through when
// ADMIN_TOKEN was unset + MOLECULE_ENV=dev has been removed —
// local dev now authenticates with a provisioned ADMIN_TOKEN
// (see scripts/dev-start.sh).
// Local-dev escape hatch — see devmode.go. Unreachable on SaaS
// (hosted tenants always have ADMIN_TOKEN + MOLECULE_ENV=production).
if isDevModeFailOpen() {
c.Next()
return
}
c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "missing workspace auth token"})
}
}
@@ -133,18 +133,11 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc {
// AdminAuth returns a Gin middleware for global/admin routes (e.g.
// /settings/secrets, /admin/secrets) that have no per-workspace scope.
//
// FAIL-CLOSED in every environment (harden/no-fail-open-auth): there is no
// bearer-less path through this middleware. A request reaches the handler
// ONLY by presenting a valid credential (verified CP session cookie, org
// token, ADMIN_TOKEN, or — deprecated — a live workspace token). The former
// "Tier-1 lazy-bootstrap fail-open" (no live tokens + no ADMIN_TOKEN ⇒ pass)
// has been removed: it let an attacker pre-empt the first user by POSTing
// /org/import before any token was minted (C4 SaaS-launch finding). A fresh
// install must set ADMIN_TOKEN to reach admin routes.
//
// # Credential tier (evaluated in order)
//
// 1. Verified CP session cookie (SaaS canvas) — upstream-confirmed.
// 1. Lazy-bootstrap fail-open: if no live workspace token exists anywhere on
// the platform (fresh install / pre-Phase-30 upgrade), every request passes
// through so existing deployments keep working.
//
// 2. ADMIN_TOKEN env var (recommended, closes #684): when set, the bearer
// MUST equal this value exactly (constant-time comparison). Workspace
@@ -170,17 +163,33 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc {
ctx := c.Request.Context()
adminSecret := os.Getenv("ADMIN_TOKEN")
// (harden/no-fail-open-auth) Both former fail-open branches have
// been REMOVED here:
// - Tier-1 lazy-bootstrap (no live tokens + no ADMIN_TOKEN ⇒ pass)
// - Tier-1b local-dev escape hatch (isDevModeFailOpen ⇒ pass)
// Admin auth is now fail-CLOSED in every environment. We still probe
// HasAnyLiveTokenGlobal so a datastore outage returns a structured
// 503 (not a silent pass), but its result no longer opens any path.
if _, err := wsauth.HasAnyLiveTokenGlobal(ctx, database); err != nil {
hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database)
if err != nil {
abortAuthLookupError(c, "AdminAuth: HasAnyLiveTokenGlobal", err)
return
}
if !hasLive {
// Tier 1: fail-open is ONLY safe when ADMIN_TOKEN is unset
// (self-hosted dev, pre-Phase-30 upgrade). Hosted SaaS always
// sets ADMIN_TOKEN at provision time, and C4 (SaaS-launch
// blocker) showed that without this guard an attacker can
// pre-empt the first user by POSTing /org/import before any
// token gets minted. When ADMIN_TOKEN is set we fall through
// into the same bearer-check path Tier-2 uses below.
if adminSecret == "" {
c.Next()
return
}
}
// Tier 1b: Local-dev escape hatch — see devmode.go. Lets the
// Canvas dashboard keep working after the first workspace token
// lands in the DB on `go run ./cmd/server`. Unreachable on SaaS
// (hosted tenants always have ADMIN_TOKEN + MOLECULE_ENV=production).
if isDevModeFailOpen() {
c.Next()
return
}
// SaaS-canvas path: when the request carries a WorkOS session
// cookie AND the CP confirms it's valid, accept without a
@@ -272,46 +281,34 @@ func cpSessionActor(cookieHeader string) string {
// Accepts either:
//
// 1. A valid bearer token (same contract as AdminAuth) — covers molecli,
// agent-to-platform calls, the browser canvas (which now sends
// Authorization: Bearer $NEXT_PUBLIC_ADMIN_TOKEN on every platform
// call — see canvas/src/lib/api.ts platformAuthHeaders), and anyone
// using the API directly.
// 2. A same-origin canvas request (Referer/Host match), but ONLY when the
// combined-tenant canvas proxy is active (CANVAS_PROXY_URL set). This is
// a real same-origin check the browser cannot forge cross-origin (see
// isSameOriginCanvas / IsVerifiedCanvasSession, #623/#194) — NOT the
// trivially-forgeable cross-origin Origin header. The forgeable
// CORS_ORIGINS Origin-match path was REMOVED under the CTO
// "nothing fail-open" directive (a no-bearer request passing purely on a
// spoofable Origin is effectively open even for a cosmetic route, and is
// no longer needed now that the canvas always sends a bearer).
// agent-to-platform calls, and anyone using the API directly.
// 2. A browser Origin header that matches CORS_ORIGINS (canvas itself).
// This is NOT a strict auth boundary — curl can forge Origin — but for
// cosmetic-only routes the trade-off is acceptable. Non-cosmetic routes
// MUST NOT use this middleware (see #194 review on why it would re-open
// #164 CRITICAL if applied to /bundles/import).
//
// Non-cosmetic routes MUST NOT use this middleware (see #194 review on why it
// would re-open #164 CRITICAL if applied to /bundles/import).
//
// (harden/no-fail-open-auth) Two former fail-open branches are REMOVED:
// - DB-error on HasAnyLiveTokenGlobal used to `c.Next()` (allow); it now
// fails CLOSED with 503 (availability tradeoff that grants NO access).
// - The lazy-bootstrap pass (`!hasLive ⇒ c.Next()`) used to let a
// zero-token install through EVERYTHING; it is gone. Bootstrap is now via
// ADMIN_TOKEN (provisioned by scripts/dev-start.sh for local dev,
// operator/SaaS-set in production) — local mimics production.
// Lazy-bootstrap fail-open preserved: zero-token installs pass everything
// through so fresh self-hosted / dev sessions aren't bricked.
func CanvasOrBearer(database *sql.DB) gin.HandlerFunc {
return func(c *gin.Context) {
ctx := c.Request.Context()
// Probe global token state for the (no-bearer) same-origin path
// below. Fail CLOSED on a datastore error — an availability tradeoff
// that does NOT grant access (was: log + c.Next() fail-open).
if _, err := wsauth.HasAnyLiveTokenGlobal(ctx, database); err != nil {
abortAuthLookupError(c, "CanvasOrBearer: HasAnyLiveTokenGlobal", err)
hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database)
if err != nil {
log.Printf("wsauth: CanvasOrBearer HasAnyLiveTokenGlobal failed: %v — allowing request", err)
c.Next()
return
}
if !hasLive {
c.Next()
return
}
// Path 1: bearer present → bearer MUST validate. Do not fall through
// to the same-origin path on an invalid bearer — an attacker with a
// revoked / expired token would otherwise bypass auth.
// Empty bearer → fall to the same-origin canvas path.
// to Origin on an invalid bearer — an attacker with a revoked /
// expired token + a matching Origin would otherwise bypass auth.
// Empty bearer → skip to Origin path (canvas never sends one).
if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" {
// Admin token accepted for canvas dashboard
adminSecret := os.Getenv("ADMIN_TOKEN")
@@ -327,10 +324,13 @@ func CanvasOrBearer(database *sql.DB) gin.HandlerFunc {
return
}
// Path 2: same-origin canvas (combined-tenant image). Gated behind
// canvasProxyActive (CANVAS_PROXY_URL) and a non-forgeable
// Referer/Host same-origin check — NOT the spoofable cross-origin
// Origin header (that path was removed, see doc comment above).
// Path 2: canvas origin match (cross-origin canvas).
if canvasOriginAllowed(c.GetHeader("Origin")) {
c.Next()
return
}
// Path 3: same-origin canvas (tenant image).
if isSameOriginCanvas(c) {
c.Next()
return
@@ -340,14 +340,30 @@ func CanvasOrBearer(database *sql.DB) gin.HandlerFunc {
}
}
// (harden/no-fail-open-auth) canvasOriginAllowed was REMOVED. It matched a
// request's (trivially forgeable, cross-origin) Origin header against
// CORS_ORIGINS and was the basis of CanvasOrBearer's no-bearer Origin-match
// pass — effectively open to any curl that sets a matching Origin. Under the
// CTO "nothing fail-open" directive that path is gone; the canvas now always
// sends a bearer (NEXT_PUBLIC_ADMIN_TOKEN), so nothing legitimate relied on it.
// The CORS *response-header* allowlist is handled by the real CORS middleware
// upstream, unaffected by this removal.
// canvasOriginAllowed returns true if origin matches any entry in the
// CORS_ORIGINS env var (comma-separated) or the localhost defaults.
// Exact-match only; no prefix or wildcard logic — that's handled by the
// real CORS middleware upstream. The intent here is "did this request come
// from the canvas page the user is already logged into?" — a binary check.
func canvasOriginAllowed(origin string) bool {
if origin == "" {
return false
}
allowed := []string{"http://localhost:3000", "http://localhost:3001"}
if v := os.Getenv("CORS_ORIGINS"); v != "" {
for _, o := range strings.Split(v, ",") {
if o = strings.TrimSpace(o); o != "" {
allowed = append(allowed, o)
}
}
}
for _, a := range allowed {
if a == origin {
return true
}
}
return false
}
// isSameOriginCanvas returns true when the request appears to come from the
// canvas UI served by the same Go process (tenant image). In this topology,
@@ -143,15 +143,11 @@ func TestCanvasOrBearer_AdminTokenEnv_Passes(t *testing.T) {
}
}
// TestCanvasOrBearer_DBError_FailsClosed pins the removal of the
// fail-open-on-DB-error branch (harden/no-fail-open-auth). A
// HasAnyLiveTokenGlobal failure used to log + c.Next() (allow); it now fails
// CLOSED with 503 — an availability tradeoff that grants NO access. The
// handler must NOT be reached.
//
// Watch-it-fail: restore `if err != nil { log; c.Next(); return }` in
// CanvasOrBearer → this flips 503→200 and fails.
func TestCanvasOrBearer_DBError_FailsClosed(t *testing.T) {
// TestCanvasOrBearer_DBError_FailOpen pins the documented behavior on a
// HasAnyLiveTokenGlobal failure. The middleware logs and falls open so a
// flaky DB doesn't lock canvas users out of cosmetic routes. Hardcoded in
// the comment block; this is a reminder if anyone changes that semantic.
func TestCanvasOrBearer_DBError_FailOpen(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
@@ -160,10 +156,8 @@ func TestCanvasOrBearer_DBError_FailsClosed(t *testing.T) {
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnError(http.ErrAbortHandler) // any non-nil error suffices
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
@@ -171,11 +165,8 @@ func TestCanvasOrBearer_DBError_FailsClosed(t *testing.T) {
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusServiceUnavailable {
t.Errorf("DB error must fail CLOSED: got %d, want 503 (%s)", w.Code, w.Body.String())
}
if handlerCalled {
t.Error("handler reached on a datastore-error request — DB-error fail-open not removed")
if w.Code != http.StatusOK {
t.Errorf("DB error fail-open: got %d, want 200 (%s)", w.Code, w.Body.String())
}
}
@@ -339,24 +339,15 @@ func TestWorkspaceAuth_WrongWorkspace_Returns401(t *testing.T) {
// TestAdminAuth_FailOpen_NoTokensGlobally — C10/C11: on a fresh install (no
// live tokens anywhere) the middleware must let the request through so existing
// deployments keep working during the Phase-30 rollout.
// TestAdminAuth_FreshInstallNoTokens_FailsClosed pins the post-hardening
// contract (harden/no-fail-open-auth): on a fresh install with NO live
// tokens anywhere AND no ADMIN_TOKEN, a bearer-less admin request now 401s.
// The former Tier-1 "lazy-bootstrap fail-open" (no tokens ⇒ 200) is GONE —
// it let an attacker pre-empt the first user via /org/import (C4). A fresh
// install must provision ADMIN_TOKEN to reach admin routes.
func TestAdminAuth_FreshInstallNoTokens_FailsClosed(t *testing.T) {
func TestAdminAuth_FailOpen_NoTokensGlobally(t *testing.T) {
t.Setenv("ADMIN_TOKEN", "")
t.Setenv("MOLECULE_ENV", "")
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock.New: %v", err)
}
defer mockDB.Close()
// HasAnyLiveTokenGlobal returns 0 — fresh install. We still probe it
// (so a DB outage yields a structured 503), but the result no longer
// opens any path.
// HasAnyLiveTokenGlobal returns 0 — fresh install.
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
@@ -369,8 +360,8 @@ func TestAdminAuth_FreshInstallNoTokens_FailsClosed(t *testing.T) {
req, _ := http.NewRequest(http.MethodGet, "/admin/secrets", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("fresh-install no-token fail-closed: expected 401, got %d: %s", w.Code, w.Body.String())
if w.Code != http.StatusOK {
t.Errorf("C10 fail-open (no global tokens): expected 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
@@ -840,23 +831,18 @@ func TestAdminAuth_Issue180_ApprovalsListing_NoBearer_Returns401(t *testing.T) {
}
}
// TestAdminAuth_Issue180_ApprovalsListing_FreshInstall_FailsClosed pins the
// post-hardening contract (harden/no-fail-open-auth): on a fresh install (no
// tokens anywhere, no ADMIN_TOKEN), the canvas polling /approvals/pending with
// no bearer now gets 401. The former #180 fail-open (200 on no-tokens) is gone
// — local dev now provisions an ADMIN_TOKEN and the canvas authenticates with
// it (scripts/dev-start.sh).
func TestAdminAuth_Issue180_ApprovalsListing_FreshInstall_FailsClosed(t *testing.T) {
// TestAdminAuth_Issue180_ApprovalsListing_FailOpen_NoTokens documents the
// fail-open contract: on a fresh install (no tokens anywhere), the middleware
// must not block the canvas from polling /approvals/pending.
func TestAdminAuth_Issue180_ApprovalsListing_FailOpen_NoTokens(t *testing.T) {
t.Setenv("ADMIN_TOKEN", "")
t.Setenv("MOLECULE_ENV", "")
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock.New: %v", err)
}
defer mockDB.Close()
// HasAnyLiveTokenGlobal returns 0 — fresh install, no tokens yet. Probed
// for the 503-on-outage semantics, but it opens no path now.
// HasAnyLiveTokenGlobal returns 0 — fresh install, no tokens yet.
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
@@ -869,21 +855,24 @@ func TestAdminAuth_Issue180_ApprovalsListing_FreshInstall_FailsClosed(t *testing
req, _ := http.NewRequest(http.MethodGet, "/approvals/pending", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("#180 fresh-install fail-closed: expected 401, got %d: %s", w.Code, w.Body.String())
if w.Code != http.StatusOK {
t.Errorf("#180 fail-open (no tokens): expected 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestWorkspaceAuth_DevMode_NoBearer_FailsClosed pins the post-hardening
// contract (harden/no-fail-open-auth): the former local-dev escape hatch on
// WorkspaceAuth — which let a bearer-less request through when
// MOLECULE_ENV=dev + ADMIN_TOKEN unset — is GONE. Under exactly those
// conditions the request now 401s. Local dev authenticates with a
// provisioned ADMIN_TOKEN handed to the Canvas (scripts/dev-start.sh).
func TestWorkspaceAuth_DevMode_NoBearer_FailsClosed(t *testing.T) {
// TestWorkspaceAuth_DevModeEscapeHatch_NoBearer_FailsOpen documents the
// local-dev escape hatch on WorkspaceAuth. On `go run ./cmd/server` +
// `npm run dev`, Canvas at localhost:3000 calls the platform at
// localhost:8080 cross-port, so isSameOriginCanvas's Host==Referer
// check fails. Without this hatch the Canvas can't show per-workspace
// activity/delegations.
//
// SaaS never fires this branch because tenant provisioning sets both
// MOLECULE_ENV=production and ADMIN_TOKEN.
func TestWorkspaceAuth_DevModeEscapeHatch_NoBearer_FailsOpen(t *testing.T) {
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "")
@@ -893,9 +882,7 @@ func TestWorkspaceAuth_DevMode_NoBearer_FailsClosed(t *testing.T) {
}
defer mockDB.Close()
// No DB queries expected — WorkspaceAuth 401s before any lookup when
// there is no bearer / cookie. The hatch that used to short-circuit
// here no longer exists.
// No DB queries expected — the hatch short-circuits before any lookup.
r := gin.New()
r.GET("/workspaces/:id/activity", WorkspaceAuth(mockDB), func(c *gin.Context) {
@@ -907,8 +894,8 @@ func TestWorkspaceAuth_DevMode_NoBearer_FailsClosed(t *testing.T) {
"/workspaces/00000000-0000-0000-0000-000000000000/activity", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("WorkspaceAuth dev-mode fail-closed: expected 401, got %d: %s", w.Code, w.Body.String())
if w.Code != http.StatusOK {
t.Errorf("WorkspaceAuth dev-mode hatch: expected 200, got %d: %s", w.Code, w.Body.String())
}
}
@@ -970,14 +957,15 @@ func TestWorkspaceAuth_DevModeEscapeHatch_IgnoredWhenAdminTokenSet(t *testing.T)
}
}
// TestAdminAuth_DevMode_NoBearer_FailsClosed pins the post-hardening contract
// (harden/no-fail-open-auth): the former Tier-1b dev-mode escape hatch — which
// let AdminAuth pass a bearer-less request when MOLECULE_ENV=dev + ADMIN_TOKEN
// unset, even with live tokens in the DB — is GONE. Under exactly those
// conditions the request now 401s. Local dev authenticates with a provisioned
// ADMIN_TOKEN handed to the Canvas as NEXT_PUBLIC_ADMIN_TOKEN
// (scripts/dev-start.sh).
func TestAdminAuth_DevMode_NoBearer_FailsClosed(t *testing.T) {
// TestAdminAuth_DevModeEscapeHatch_FailsOpenWithHasLiveTokens documents the
// Tier-1b dev-mode escape hatch. When the platform runs with MOLECULE_ENV=development
// and ADMIN_TOKEN is unset, AdminAuth must stay fail-open even after workspace
// tokens land in the DB. This keeps the Canvas dashboard usable in local dev
// after the first workspace is created (PR #1871 — quickstart bugless).
//
// SaaS never hits this path because tenant provisioning sets both
// ADMIN_TOKEN and MOLECULE_ENV=production.
func TestAdminAuth_DevModeEscapeHatch_FailsOpenWithHasLiveTokens(t *testing.T) {
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "")
@@ -988,7 +976,7 @@ func TestAdminAuth_DevMode_NoBearer_FailsClosed(t *testing.T) {
defer mockDB.Close()
// HasAnyLiveTokenGlobal returns 1 — tokens exist (post first-workspace).
// Probed for the 503-on-outage semantics, but it opens no path now.
// The Tier-1 fail-open branch WOULD close here. Tier-1b must still open.
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
@@ -1001,8 +989,8 @@ func TestAdminAuth_DevMode_NoBearer_FailsClosed(t *testing.T) {
req, _ := http.NewRequest(http.MethodGet, "/workspaces", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("dev-mode fail-closed: expected 401, got %d: %s", w.Code, w.Body.String())
if w.Code != http.StatusOK {
t.Errorf("dev-mode escape hatch: expected 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
@@ -1116,16 +1104,7 @@ func TestAdminAuth_Issue120_PatchWorkspace_NoBearer_Returns401(t *testing.T) {
// Accepts bearer or a matching Origin header. MUST NOT be used anywhere a
// forged request would leak data or create resources.
// TestCanvasOrBearer_NoTokens_FailsClosed pins the removal of the
// lazy-bootstrap fail-open (harden/no-fail-open-auth): a zero-token install
// must NOT pass everything through. A bearer-less request on a fresh install
// (HasAnyLiveTokenGlobal → 0) now 401s. Bootstrap is via ADMIN_TOKEN
// (scripts/dev-start.sh provisions it for local dev; operator/SaaS sets it in
// production) — not a zero-config fail-open.
//
// Watch-it-fail: restore `if !hasLive { c.Next(); return }` in CanvasOrBearer
// → this flips 401→200 and fails.
func TestCanvasOrBearer_NoTokens_FailsClosed(t *testing.T) {
func TestCanvasOrBearer_NoTokens_FailOpen(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
@@ -1135,10 +1114,8 @@ func TestCanvasOrBearer_NoTokens_FailsClosed(t *testing.T) {
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
@@ -1146,11 +1123,8 @@ func TestCanvasOrBearer_NoTokens_FailsClosed(t *testing.T) {
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("zero-token install must fail CLOSED (lazy-bootstrap fail-open removed): got %d, want 401 (%s)", w.Code, w.Body.String())
}
if handlerCalled {
t.Error("handler reached on a fresh-install bearer-less request — lazy-bootstrap fail-open not removed")
if w.Code != http.StatusOK {
t.Errorf("bootstrap fail-open: got %d, want 200 (%s)", w.Code, w.Body.String())
}
}
@@ -1221,16 +1195,7 @@ func TestCanvasOrBearer_TokensExist_WrongOrigin_Returns401(t *testing.T) {
}
}
// TestCanvasOrBearer_TokensExist_ForgeableOrigin_NoBearer_FailsClosed pins the
// removal of the cross-origin Origin-match cosmetic path
// (harden/no-fail-open-auth). A no-bearer request whose forgeable Origin header
// matches CORS_ORIGINS used to pass; it now 401s. The canvas always sends a
// bearer (NEXT_PUBLIC_ADMIN_TOKEN), so legitimate traffic is unaffected, and a
// curl that forges Origin can no longer reach even a cosmetic route.
//
// Watch-it-fail: restore `if canvasOriginAllowed(c.GetHeader("Origin")) {
// c.Next(); return }` in CanvasOrBearer → this flips 401→200 and fails.
func TestCanvasOrBearer_TokensExist_ForgeableOrigin_NoBearer_FailsClosed(t *testing.T) {
func TestCanvasOrBearer_TokensExist_CanvasOrigin_Passes(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
@@ -1242,24 +1207,18 @@ func TestCanvasOrBearer_TokensExist_ForgeableOrigin_NoBearer_FailsClosed(t *test
t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app,https://bob.moleculesai.app")
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
// A matching-but-forgeable Origin with NO bearer must NOT pass anymore.
req.Header.Set("Origin", "https://acme.moleculesai.app")
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("no-bearer request on a forgeable matching Origin must fail CLOSED (Origin-match path removed): got %d, want 401 (%s)", w.Code, w.Body.String())
}
if handlerCalled {
t.Error("handler reached on a no-bearer forgeable-Origin request — Origin-match fail-open not removed")
if w.Code != http.StatusOK {
t.Errorf("canvas origin: got %d, want 200 (%s)", w.Code, w.Body.String())
}
}
@@ -1339,9 +1298,21 @@ func TestCanvasOrBearer_WrongOrigin_Blocked(t *testing.T) {
}
}
// (harden/no-fail-open-auth) TestCanvasOriginAllowed_* were REMOVED along with
// the canvasOriginAllowed helper they exercised — the forgeable cross-origin
// Origin-match cosmetic path no longer exists in CanvasOrBearer.
func TestCanvasOriginAllowed_EmptyOriginRejected(t *testing.T) {
if canvasOriginAllowed("") {
t.Error("empty Origin must not pass")
}
}
func TestCanvasOriginAllowed_LocalhostDefault(t *testing.T) {
t.Setenv("CORS_ORIGINS", "")
if !canvasOriginAllowed("http://localhost:3000") {
t.Error("localhost:3000 should be allowed by default")
}
if canvasOriginAllowed("http://evil.example.com") {
t.Error("random origin should not be allowed")
}
}
// ── Issue #623 regression ─────────────────────────────────────────────────────
// AdminAuth must NOT accept forged Origin headers. Any container on the Docker
@@ -1,160 +0,0 @@
package models
// Contract test: the EXACT request bodies the workspace runtime emits for
// POST /registry/register and POST /registry/heartbeat bind cleanly against
// the real RegisterPayload / HeartbeatPayload structs — and a body missing a
// binding:"required" field is REJECTED.
//
// Why this exists — the same blind-spot class as the #2251 A2A bug
// ----------------------------------------------------------------
// The existing registry_test.go binds HAND-WRITTEN JSON literals
// (`{"id":"ws-123","agent_card":{...}}`) that encode the *test author's*
// idea of the wire shape, not the bytes the runtime actually produces. The
// runtime's producer (molecule-ai-workspace-runtime main.py:484 /
// heartbeat.py:233) is a separate hand-rolled dict. Nothing pinned that the
// two agree on the required keys.
//
// These golden bodies are byte-for-byte the shapes the runtime emits (see the
// companion Python contract test test_registry_payload_contract.py, which
// asserts the runtime PRODUCES exactly these required keys). Together the two
// halves form a producer→consumer contract: if the runtime drops a required
// key, the Python test fails; if this struct adds/renames a required field,
// the Go test below fails — drift can't pass silently on either side.
//
// gin's ShouldBindJSON runs `binding.JSON.BindBody`, which is json.Unmarshal
// followed by the go-playground validator on the `binding` tags. We invoke
// that exact path here without standing up a gin.Context / DB / Redis.
import (
"testing"
"github.com/gin-gonic/gin/binding"
)
// bindJSON mirrors gin's ShouldBindJSON: decode + validate the `binding` tags.
func bindJSON(t *testing.T, body []byte, out any) error {
t.Helper()
return binding.JSON.BindBody(body, out)
}
// ---- /registry/register --------------------------------------------------
// The exact body main.py emits (workspace_id + workspace_url + the hand-rolled
// agent_card_dict). agent_card is json.RawMessage on the struct so its inner
// shape is opaque to the bind — only presence is required.
const runtimeRegisterBody = `{
"id": "11111111-1111-1111-1111-111111111111",
"url": "https://ws.example/a2a",
"agent_card": {
"name": "pm",
"description": "team lead",
"version": "1.0.0",
"url": "https://ws.example/a2a",
"skills": [{"id": "coding", "name": "coding", "description": "coding", "tags": []}],
"capabilities": {"streaming": true, "pushNotifications": false},
"configuration_status": "ready"
}
}`
func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) {
var p RegisterPayload
if err := bindJSON(t, []byte(runtimeRegisterBody), &p); err != nil {
t.Fatalf("runtime register body must bind against RegisterPayload, got: %v", err)
}
if p.ID != "11111111-1111-1111-1111-111111111111" {
t.Errorf("id not decoded: %q", p.ID)
}
if len(p.AgentCard) == 0 {
t.Error("agent_card must be present (binding:required)")
}
if p.URL == "" {
t.Error("url should round-trip from the runtime body")
}
}
func TestRegisterPayload_MissingID_Rejected(t *testing.T) {
// The #2251-style regression: runtime drops the required `id` key.
const noID = `{"url":"https://ws.example/a2a","agent_card":{"name":"pm"}}`
var p RegisterPayload
if err := bindJSON(t, []byte(noID), &p); err == nil {
t.Fatal("a register body missing the required `id` MUST be rejected (would 400); got nil error")
}
}
func TestRegisterPayload_MissingAgentCard_Rejected(t *testing.T) {
const noCard = `{"id":"ws-1","url":"https://ws.example/a2a"}`
var p RegisterPayload
if err := bindJSON(t, []byte(noCard), &p); err == nil {
t.Fatal("a register body missing the required `agent_card` MUST be rejected (would 400); got nil error")
}
}
// ---- /registry/heartbeat -------------------------------------------------
// The exact body heartbeat.py:233 emits (no wedge/metadata, the healthy case).
const runtimeHeartbeatBody = `{
"workspace_id": "00000000-0000-0000-0000-000000000688",
"error_rate": 0.0,
"sample_error": "",
"active_tasks": 0,
"current_task": "",
"uptime_seconds": 42
}`
func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) {
var p HeartbeatPayload
if err := bindJSON(t, []byte(runtimeHeartbeatBody), &p); err != nil {
t.Fatalf("runtime heartbeat body must bind against HeartbeatPayload, got: %v", err)
}
if p.WorkspaceID != "00000000-0000-0000-0000-000000000688" {
t.Errorf("workspace_id not decoded: %q", p.WorkspaceID)
}
if p.UptimeSeconds != 42 {
t.Errorf("uptime_seconds not decoded: %d", p.UptimeSeconds)
}
}
// The wedged-runtime heartbeat (heartbeat.py _runtime_state_payload +
// _runtime_metadata_payload layered on) must also bind — runtime_metadata is a
// pointer so a present block decodes, and an absent one stays nil.
const runtimeHeartbeatWedgedBody = `{
"workspace_id": "00000000-0000-0000-0000-000000000688",
"error_rate": 0.5,
"active_tasks": 1,
"current_task": "stuck",
"uptime_seconds": 99,
"runtime_state": "wedged",
"sample_error": "Control request timeout: initialize",
"runtime_metadata": {
"capabilities": {"heartbeat": true, "scheduler": false},
"idle_timeout_seconds": 600
}
}`
func TestHeartbeatPayload_WedgedRuntimeBodyBinds(t *testing.T) {
var p HeartbeatPayload
if err := bindJSON(t, []byte(runtimeHeartbeatWedgedBody), &p); err != nil {
t.Fatalf("wedged heartbeat body must bind, got: %v", err)
}
if p.RuntimeState != "wedged" {
t.Errorf("runtime_state not decoded: %q", p.RuntimeState)
}
if p.RuntimeMetadata == nil {
t.Fatal("runtime_metadata must decode to a non-nil pointer when present")
}
if got := p.RuntimeMetadata.Capabilities["heartbeat"]; !got {
t.Error("runtime_metadata.capabilities[heartbeat] should be true")
}
if p.RuntimeMetadata.IdleTimeoutSeconds == nil || *p.RuntimeMetadata.IdleTimeoutSeconds != 600 {
t.Error("runtime_metadata.idle_timeout_seconds should decode to 600")
}
}
func TestHeartbeatPayload_MissingWorkspaceID_Rejected(t *testing.T) {
// The drift the producer-side Python test guards: workspace_id renamed/dropped.
const renamed = `{"id":"ws-688","error_rate":0.0,"active_tasks":0}`
var p HeartbeatPayload
if err := bindJSON(t, []byte(renamed), &p); err == nil {
t.Fatal("a heartbeat body missing the required `workspace_id` MUST be rejected (would 400); got nil error")
}
}
@@ -1,336 +0,0 @@
package providers
// derive_provider_matrix_test.go — the SSOT-DRIVEN provider-routing matrix
// (internal#718 / coverage-audit hole closure).
//
// GOAL (CTO "e2e covers every supported runtime and provider, no regressions"):
// a KEYLESS, REQUIRED-lane-gateable test that asserts EVERY offered
// (runtime × model/provider arm) in the providers SSOT resolves to the EXACT
// correct provider via DeriveProvider — closing the provider-routing-correctness
// hole for ALL providers + every BYOK arm without needing any LLM key.
//
// WHY THIS IS THE HIGH-LEVERAGE TEST. DeriveProvider(runtime, modelId) +
// ModelPrefixMatch resolve a model id to a provider with NO upstream call — it
// is a pure function of the merged registry. So the ENTIRE offered routing table
// (every (runtime → provider) pair, including hermes's 17 name-only BYOK arms,
// claude-code's zai/deepseek/xiaomi-mimo, openclaw's byok-openai/byok-minimax/
// groq/openrouter/custom, codex's byok-minimax, etc.) is gateable in the
// REQUIRED `CI / all-required` lane with zero secrets. A regression in the
// routing table (wrong provider, dropped arm, bad regex) now reds CI instead of
// shipping silently and wedging a tenant agent at boot.
//
// SELF-MAINTAINING / SSOT-DRIVEN (NOT hardcoded). The matrix is DRIVEN FROM the
// loaded manifest (LoadManifest().Runtimes — the same SSOT production reads),
// not a hand-listed table:
//
// - EXACT-LISTED arms (a runtime ref with non-empty Models): every model id is
// iterated straight off the SSOT and its EXPECTED provider is COMPUTED from
// the SSOT (the native arm(s) that exact-list the id; first-declared wins the
// "one id, two auth arms" codex/anthropic shape) — so a newly-added model is
// AUTO-COVERED and a misrouted one fails RED naming the mismatch.
// - NAME-ONLY arms (a runtime ref with zero Models — pure prefix-routing BYOK):
// these have no exact id to iterate, so each is probed with a REPRESENTATIVE
// BYOK id its regex must own (representativeBYOKModel). The matrix REQUIRES a
// representative for EVERY name-only arm it encounters in the SSOT — so
// "added a name-only provider arm but forgot to wire routing / supply a
// sample" fails RED here, keeping the probe set honest as the SSOT grows.
//
// Every asserted (runtime, model) is ALSO checked for registration-validity
// (the validateRegisteredModelForRuntime predicate: on the platform menu OR
// DeriveProvider resolves) so the matrix proves no offered id silently falls
// through to "unregistered/unselectable".
import (
"sort"
"testing"
)
// representativeBYOKModel maps a provider NAME to a representative BYOK model id
// that the provider's model_prefix_match MUST own. It is the routing probe for
// NAME-ONLY native arms (refs with zero exact Models — the cp#529 pure-prefix
// BYOK arms). The matrix asserts every name-only arm in the SSOT has an entry
// here AND that DeriveProvider routes the sample id to exactly that provider.
//
// Adding a name-only provider arm to providers.yaml WITHOUT adding a
// representative here fails TestDeriveProviderMatrix_SSOTDriven loudly — that is
// the self-maintaining contract that keeps "new BYOK arm but no routing proof"
// from shipping. The id must be one the provider's regex matches and NO sibling
// native arm of the same runtime also matches (else it is a registry overlap the
// load guard or the auth-env tie-break — not this map — must resolve).
var representativeBYOKModel = map[string]string{
// hermes passthrough + bare-vendor BYOK arms (all ^name[:/]) ----------
"openrouter": "openrouter/anthropic/claude-3.5-sonnet",
"huggingface": "huggingface/meta-llama/Llama-3.3-70B",
"ai-gateway": "ai-gateway/openai/gpt-4o",
"opencode-zen": "opencode-zen/some-model",
"opencode-go": "opencode-go/some-model",
"kilocode": "kilocode/some-model",
"custom": "custom/my-endpoint-model",
"nvidia": "nvidia/nemotron-4",
"arcee": "arcee/coder",
"ollama-cloud": "ollama-cloud/qwen2.5",
"minimax-cn": "minimax-cn/abab6.5",
"nousresearch": "nousresearch/Hermes-3",
// claude-code + hermes name-only arms (case-insensitive vendor-prefixed)
"deepseek": "deepseek-chat",
"zai": "zai:glm-4.6",
"xiaomi-mimo": "mimo-7b",
"alibaba": "qwen-max",
// dedicated BYOK-vendor arms (hermes/openclaw/codex) ------------------
"byok-anthropic": "anthropic/claude-opus-4-7",
"byok-gemini": "gemini/gemini-2.5-pro",
"byok-openai": "openai:gpt-4o",
"byok-minimax": "minimax:MiniMax-M2.7",
"groq": "groq:llama-3.3-70b",
}
// sortedRuntimeNames returns the manifest's runtime names sorted, for a
// deterministic iteration order (the loaded Runtimes is a map).
func sortedRuntimeNames(m *Manifest) []string {
out := make([]string, 0, len(m.Runtimes))
for rt := range m.Runtimes {
out = append(out, rt)
}
sort.Strings(out)
return out
}
// expectedExactProvider computes, FROM THE SSOT, the provider DeriveProvider
// must resolve a given exact-listed model id to for a runtime — without calling
// DeriveProvider. It mirrors DeriveProvider's exact-id rule (steps 3): the
// native arm(s), in declaration order, that exact-list the id. When exactly one
// arm lists it, that arm is the answer. When MORE THAN ONE lists it (the
// legitimate "one model id, two auth arms" codex gpt-* / claude-code anthropic
// shape), the FIRST-declared arm is the deterministic no-auth default. Returns
// "" if no native arm exact-lists the id (caller treats that as "not an
// exact-listed case").
func expectedExactProvider(native RuntimeNativeSet, model string) string {
for _, ref := range native.Providers {
for _, mid := range ref.Models {
if mid == model {
// First-declared arm that lists it = DeriveProvider's no-auth
// answer (single-arm → that arm; multi-arm → first declared).
return ref.Name
}
}
}
return ""
}
// isRegisteredForRuntime mirrors handlers.validateRegisteredModelForRuntime's
// allow predicate against the SAME registry (kept here so the matrix lives in
// the providers package, which owns the SSOT and cannot import handlers): a
// (runtime, model) is registration-valid iff it is on the runtime's platform
// menu (ModelsForRuntime) OR DeriveProvider resolves a native provider for it.
// A name-only BYOK id is NOT on the platform menu but IS routable — exactly the
// cp#529 routability-aware OR the handler enforces and the controlplane drift
// checker mirrors.
func isRegisteredForRuntime(m *Manifest, runtime, model string) bool {
models, err := m.ModelsForRuntime(runtime)
if err != nil {
return false // unknown runtime — not registration-valid here.
}
for _, mid := range models {
if mid == model {
return true
}
}
_, derr := m.DeriveProvider(runtime, model, nil)
return derr == nil
}
// TestDeriveProviderMatrix_SSOTDriven is the headline coverage test: for EVERY
// runtime in the registry and EVERY model/provider arm offered to it in the
// SSOT, it asserts (a) DeriveProvider resolves to the EXACT expected provider,
// (b) the (runtime, model) is registration-valid, and (c) NO offered id silently
// resolves to the wrong arm or falls through to a default/error. The table is
// DRIVEN FROM LoadManifest().Runtimes (the production SSOT) — not hardcoded — so
// a newly-added provider/model is auto-covered and an unrouteable one fails RED.
func TestDeriveProviderMatrix_SSOTDriven(t *testing.T) {
m, err := LoadManifest()
if err != nil {
t.Fatalf("LoadManifest() error = %v", err)
}
var (
exactPairs int // (runtime, exact-listed model) assertions
nameOnlyArms int // (runtime, name-only arm) routing probes
coveredArms int // distinct (runtime, provider) arms touched
coveredProvs = map[string]struct{}{}
coveredRTs = map[string]struct{}{}
)
for _, rt := range sortedRuntimeNames(m) {
native := m.Runtimes[rt]
coveredRTs[rt] = struct{}{}
for _, ref := range native.Providers {
coveredArms++
coveredProvs[ref.Name] = struct{}{}
if len(ref.Models) == 0 {
// ---- NAME-ONLY ARM: pure prefix-routing BYOK -----------------
// No exact id to iterate; probe the provider's regex with a
// representative BYOK id and assert it routes to THIS arm. The
// representative MUST exist (self-maintaining contract).
nameOnlyArms++
sample, ok := representativeBYOKModel[ref.Name]
if !ok {
t.Errorf("name-only arm %q on runtime %q has NO representativeBYOKModel entry — add a sample BYOK id its model_prefix_match owns so its routing is proven (SSOT grew, probe set did not)", ref.Name, rt)
continue
}
t.Run(rt+"/name-only/"+ref.Name, func(t *testing.T) {
got, derr := m.DeriveProvider(rt, sample, nil)
if derr != nil {
t.Fatalf("DeriveProvider(%q, %q [name-only arm %q sample]) errored: %v — the arm is offered but its sample id does not route (bad/missing regex, or a sibling arm shadows it)", rt, sample, ref.Name, derr)
}
if got.Name != ref.Name {
t.Errorf("DeriveProvider(%q, %q) = %q, want %q (the name-only arm the sample id was chosen to probe) — routing table misroutes this BYOK arm", rt, sample, got.Name, ref.Name)
}
if !isRegisteredForRuntime(m, rt, sample) {
t.Errorf("name-only arm probe (%q, %q) is not registration-valid — a routable BYOK id must pass validateRegisteredModelForRuntime", rt, sample)
}
})
continue
}
// ---- EXACT-LISTED ARM: iterate every model id off the SSOT -------
for _, model := range ref.Models {
model := model
want := expectedExactProvider(native, model)
// want is computed from the SSOT and equals THIS ref.Name unless
// an earlier-declared arm also exact-lists the same id (the
// "one id, two auth arms" codex/anthropic shape), in which case
// the first-declared arm is the no-auth default. Either way the
// computed `want` is what DeriveProvider(no-auth) must return.
t.Run(rt+"/"+ref.Name+"/"+model, func(t *testing.T) {
exactPairs++
got, derr := m.DeriveProvider(rt, model, nil)
if derr != nil {
t.Fatalf("DeriveProvider(%q, %q) errored: %v — an OFFERED exact-listed id must resolve, never fall through", rt, model, derr)
}
if got.Name != want {
t.Errorf("DeriveProvider(%q, %q) = %q, want %q — offered id resolves to the WRONG arm (routing regression)", rt, model, got.Name, want)
}
if !isRegisteredForRuntime(m, rt, model) {
t.Errorf("offered exact-listed id (%q, %q) is not registration-valid (validateRegisteredModelForRuntime would reject it) — SSOT lists it but it is unrouteable/unregistered", rt, model)
}
})
}
}
}
t.Logf("MATRIX COVERAGE: %d runtimes, %d (runtime×provider) arms (%d distinct providers), %d exact-listed (runtime×model) assertions, %d name-only BYOK routing probes",
len(coveredRTs), coveredArms, len(coveredProvs), exactPairs, nameOnlyArms)
// Floor guards: if a refactor accidentally empties the SSOT or skips arms,
// these fail rather than letting an empty matrix pass green (a coverage
// regression that would otherwise be invisible).
if exactPairs == 0 {
t.Error("matrix asserted ZERO exact-listed pairs — the SSOT-driven iteration is broken")
}
if nameOnlyArms == 0 {
t.Error("matrix asserted ZERO name-only arm probes — the name-only BYOK routing is no longer being exercised")
}
}
// TestDeriveProviderMatrix_RepresentativesAreUsed proves the
// representativeBYOKModel map carries no DEAD entries: every key must correspond
// to a provider that actually appears as a name-only arm in the SSOT. A stale
// entry (a provider renamed/removed but its sample left behind) would silently
// rot; this fails it RED so the probe set stays a faithful mirror of the SSOT's
// name-only arm set. (The complementary direction — every name-only arm HAS a
// representative — is enforced inside TestDeriveProviderMatrix_SSOTDriven.)
func TestDeriveProviderMatrix_RepresentativesAreUsed(t *testing.T) {
m, err := LoadManifest()
if err != nil {
t.Fatalf("LoadManifest() error = %v", err)
}
nameOnlyProviders := map[string]struct{}{}
for _, native := range m.Runtimes {
for _, ref := range native.Providers {
if len(ref.Models) == 0 {
nameOnlyProviders[ref.Name] = struct{}{}
}
}
}
for name := range representativeBYOKModel {
if _, ok := nameOnlyProviders[name]; !ok {
t.Errorf("representativeBYOKModel has a DEAD entry %q — no name-only arm in the SSOT uses it; remove it or fix the name", name)
}
}
}
// TestDeriveProviderMatrix_KnownTrickyForms pins, as EXPLICIT assertions, the
// historically-bug-prone routing forms the SSOT-driven loop covers implicitly —
// so a regression in any of them names the exact class it broke (these are the
// cases #2263/#2274/#2265 shipped/nearly-shipped before the SSOT tightening).
// Explicit here = a failing CI line that says "the colon-vs-slash minimax split
// broke" rather than only a generic matrix-cell failure.
func TestDeriveProviderMatrix_KnownTrickyForms(t *testing.T) {
m, err := LoadManifest()
if err != nil {
t.Fatalf("LoadManifest() error = %v", err)
}
cases := []struct {
name string
runtime string
model string
authEnv []string
want string // provider name; "" => expect an unregistered/unrouteable error
wantErr bool
}{
// --- the #2263/#2274 colon-vs-slash-vs-bare MiniMax triple on claude-code:
// THREE spellings, THREE distinct outcomes. A routing-table edit that
// collapses any two of these reds here.
{"minimax bare -> BYOK minimax provider", "claude-code", "MiniMax-M2.7", []string{"MINIMAX_API_KEY"}, "minimax", false},
{"minimax slash -> platform (proxy upstream)", "claude-code", "minimax/MiniMax-M2.7", nil, "platform", false},
{"minimax colon -> UNREGISTERED on claude-code (adapter can't strip minimax:)", "claude-code", "minimax:MiniMax-M2.7", nil, "", true},
// --- openai namespaced is REJECTED on platform-shared runtimes that do
// not natively wire an openai arm (#2265 class): claude-code offers NO
// openai/openai-* native arm, so a bare gpt-* and an `openai:`/`openai/`
// id are both unregistered for it (the platform-shared openai vendor is
// never wired into a BYOK runtime → cannot bill the platform's key).
{"claude-code bare gpt -> unregistered (#2265)", "claude-code", "gpt-5.5", nil, "", true},
{"claude-code openai-namespaced -> unregistered (#2265)", "claude-code", "openai:gpt-4o", nil, "", true},
// --- groq routes to groq (openclaw's dedicated BYOK groq arm) ----------
{"openclaw groq: -> groq", "openclaw", "groq:llama-3.3-70b", nil, "groq", false},
// --- openclaw colon BYOK minimax (the runtime's DEFAULT model) ---------
{"openclaw minimax: -> byok-minimax", "openclaw", "minimax:MiniMax-M2.7", nil, "byok-minimax", false},
// --- hermes namespaced shared-vendor ids route to the BYOK-vendor arm,
// NOT platform (the cp#529 billing-safety property: a tenant's
// anthropic/gemini/openai/minimax id bills the TENANT key, not platform).
{"hermes anthropic/ -> byok-anthropic NOT platform", "hermes", "anthropic/claude-opus-4-7", nil, "byok-anthropic", false},
{"hermes gemini/ -> byok-gemini", "hermes", "gemini/gemini-2.5-pro", nil, "byok-gemini", false},
{"hermes openai: -> byok-openai", "hermes", "openai:gpt-4o", nil, "byok-openai", false},
{"hermes minimax: -> byok-minimax", "hermes", "minimax:MiniMax-M2", nil, "byok-minimax", false},
// --- codex BYOK minimax token-plan id routes via the narrow codex- leg --
{"codex codex-minimax- -> byok-minimax", "codex", "codex-minimax-m2.7", nil, "byok-minimax", false},
// --- codex gpt-* default (no auth) -> openai-subscription (first arm) ---
{"codex gpt default -> openai-subscription", "codex", "gpt-5.5", nil, "openai-subscription", false},
{"codex gpt with OPENAI_API_KEY -> openai-api", "codex", "gpt-5.5", []string{"OPENAI_API_KEY"}, "openai-api", false},
// --- google-adk platform vs BYOK google split -------------------------
{"google-adk platform: -> platform", "google-adk", "platform:gemini-2.5-pro", nil, "platform", false},
{"google-adk bare gemini -> google (BYOK)", "google-adk", "gemini-2.5-pro", nil, "google", false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got, derr := m.DeriveProvider(tc.runtime, tc.model, tc.authEnv)
if tc.wantErr {
if derr == nil {
t.Fatalf("DeriveProvider(%q, %q, %v) = %q, want an unregistered/unrouteable ERROR", tc.runtime, tc.model, tc.authEnv, got.Name)
}
if got.Name != "" {
t.Errorf("DeriveProvider(%q, %q) on error must return a zero Provider, got %q", tc.runtime, tc.model, got.Name)
}
return
}
if derr != nil {
t.Fatalf("DeriveProvider(%q, %q, %v) errored: %v, want %q", tc.runtime, tc.model, tc.authEnv, derr, tc.want)
}
if got.Name != tc.want {
t.Errorf("DeriveProvider(%q, %q, %v) = %q, want %q", tc.runtime, tc.model, tc.authEnv, got.Name, tc.want)
}
})
}
}
@@ -16,7 +16,7 @@ const SchemaVersion = 1
// Fingerprint is a stable content hash of the generated projection (schema
// version + provider catalog + runtime native sets). It changes iff the
// registry DATA changes (comment-only YAML edits do not churn it).
const Fingerprint = "e457249eb0fd77a2"
const Fingerprint = "ec6b93409e7b9cf8"
// GenProvider is the generated projection of one provider catalog entry —
// the subset a downstream consumer needs to derive + display a provider.
@@ -84,8 +84,8 @@ var Runtimes = map[string][]GenRuntimeRef{
"claude-code": {
{Name: "anthropic-oauth", Models: []string{"sonnet", "opus", "haiku", "anthropic:sonnet", "anthropic:opus", "anthropic:haiku"}},
{Name: "anthropic-api", Models: []string{"claude-sonnet-4-6", "claude-opus-4-7", "claude-haiku-4-5", "claude-sonnet-4-5", "anthropic:claude-sonnet-4-6", "anthropic:claude-opus-4-7", "anthropic:claude-haiku-4-5", "anthropic:claude-sonnet-4-5"}},
{Name: "kimi-coding", Models: []string{"kimi-for-coding", "kimi-k2.5", "kimi-k2"}},
{Name: "minimax", Models: []string{"MiniMax-M2", "MiniMax-M2.7", "MiniMax-M2.7-highspeed", "MiniMax-M3"}},
{Name: "kimi-coding", Models: []string{"kimi-for-coding", "kimi-k2.5", "kimi-k2", "moonshot:kimi-k2.6", "moonshot:kimi-k2.5"}},
{Name: "minimax", Models: []string{"MiniMax-M2", "MiniMax-M2.7", "MiniMax-M2.7-highspeed", "MiniMax-M3", "minimax:MiniMax-M2", "minimax:MiniMax-M2.7", "minimax:MiniMax-M2.7-highspeed", "minimax:MiniMax-M3"}},
{Name: "platform", Models: []string{"anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.6", "moonshot/kimi-k2.5", "minimax/MiniMax-M2.7", "minimax/MiniMax-M2.7-highspeed", "minimax/MiniMax-M3"}},
{Name: "zai", Models: []string{}},
{Name: "deepseek", Models: []string{}},
@@ -827,25 +827,29 @@ runtimes:
- anthropic:claude-sonnet-4-5
- name: kimi-coding
# BYOK kimi-coding gateway ids — bare form is the canonical id
# the gateway routes. The colon form `moonshot:kimi-k2.*` was
# removed because claude-code's adapter cannot strip the
# `moonshot:` prefix — it only handles `anthropic:`/`claude:`
# (cp#521). The bare forms already cover these models.
# the gateway routes; the colon form `moonshot:kimi-k2.*` is the
# legacy BYOK selection form (already in use on the openclaw
# native set below). claude-code's adapter accepts both
# (internal#718 P4 PR-1).
models:
- kimi-for-coding
- kimi-k2.5
- kimi-k2
- moonshot:kimi-k2.6
- moonshot:kimi-k2.5
- name: minimax
# BYOK MiniMax ids — bare form is the canonical id. The colon
# forms `minimax:MiniMax-*` were removed because claude-code's
# adapter cannot strip the `minimax:` prefix — it only handles
# `anthropic:`/`claude:` (cp#521). The bare forms already cover
# these models.
# BYOK MiniMax ids — bare form is the canonical id; colon form is
# the legacy BYOK selection spelling carried in the create corpus
# and the openclaw template (internal#718 P4 PR-1).
models:
- MiniMax-M2
- MiniMax-M2.7
- MiniMax-M2.7-highspeed
- MiniMax-M3
- minimax:MiniMax-M2
- minimax:MiniMax-M2.7
- minimax:MiniMax-M2.7-highspeed
- minimax:MiniMax-M3
# Platform-managed (no tenant key; Molecule owns billing). The
# vendor/model-namespaced ids the proxy resolves to the upstream vendor.
# Canonical for the template's `provider: platform` model entries — the
@@ -324,46 +324,3 @@ func TestVertexProviderRegistered(t *testing.T) {
}
}
}
// TestPlatformProvider_AuthEnvIsUsageTokenOnly is the SSOT-side regression
// gate for the platform-managed auth_env drift class (issue #2250 — the
// codex template's `platform` provider shipped
// auth_env: [MOLECULE_LLM_USAGE_TOKEN, ANTHROPIC_API_KEY], wrongly
// advertising a vendor key under a platform-managed provider).
//
// The `platform` provider is the closed Molecule proxy arm: the platform
// owns billing and injects MOLECULE_LLM_USAGE_TOKEN, so a tenant supplies
// NO vendor credential. Listing ANTHROPIC_API_KEY (or any other vendor key)
// in its auth_env makes the canvas demand a credential the platform path
// neither needs nor uses, and lets a stray vendor key satisfy the
// "auth present" check on a path that ignores it — exactly the wrong-bill /
// silent-no-op failure mode the BYOK-vs-platform split exists to prevent.
//
// EXACT-equality (not membership): the prior template-side test only
// asserted `"MOLECULE_LLM_USAGE_TOKEN" in auth_env`, which PASSED against
// the buggy two-element list. Pin the WHOLE set so an extra vendor key
// trips the gate. This is the core providers.yaml SSOT; the template
// derives from / must byte-match this set (drift-gated by molecule-ci).
// On core this currently PASSES (auth_env is already clean; the vendor
// key lives in the separate auth_token_env field) — the gate locks that
// in so a future drift onto this SSOT trips CI.
func TestPlatformProvider_AuthEnvIsUsageTokenOnly(t *testing.T) {
ps, err := Load()
if err != nil {
t.Fatalf("Load() error = %v", err)
}
var platform *Provider
for i := range ps {
if ps[i].Name == "platform" {
platform = &ps[i]
break
}
}
if platform == nil {
t.Fatal("platform provider missing from providers.yaml — the closed proxy arm must exist")
}
want := []string{"MOLECULE_LLM_USAGE_TOKEN"}
if len(platform.AuthEnv) != len(want) || platform.AuthEnv[0] != want[0] {
t.Errorf("platform provider auth_env = %v, want exactly %v — a vendor key under a platform-managed provider is the #2250 drift; auth_token_env (the proxy's internal projection target) is a SEPARATE field and must not leak into auth_env", platform.AuthEnv, want)
}
}
@@ -117,15 +117,14 @@ func TestModelsForRuntime_ExactModelIDs(t *testing.T) {
"anthropic:claude-haiku-4-5", "anthropic:claude-sonnet-4-5",
// anthropic via platform proxy (namespaced)
"anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6",
// kimi (kimi-coding gateway, bare form only — colon-forms removed
// because claude-code's adapter cannot strip the moonshot: prefix;
// openclaw retains them natively, cp#521).
// kimi (kimi-coding gateway, bare + legacy colon-namespaced BYOK)
"kimi-for-coding", "kimi-k2.5", "kimi-k2",
"moonshot:kimi-k2.6", "moonshot:kimi-k2.5",
// kimi via platform proxy
"moonshot/kimi-k2.6", "moonshot/kimi-k2.5",
// minimax BYOK (bare form only — colon-forms removed because
// claude-code's adapter cannot strip the minimax: prefix, cp#521).
// minimax BYOK (bare + legacy colon-namespaced)
"MiniMax-M2", "MiniMax-M2.7", "MiniMax-M2.7-highspeed", "MiniMax-M3",
"minimax:MiniMax-M2", "minimax:MiniMax-M2.7", "minimax:MiniMax-M2.7-highspeed", "minimax:MiniMax-M3",
// minimax via platform proxy
"minimax/MiniMax-M2.7", "minimax/MiniMax-M2.7-highspeed", "minimax/MiniMax-M3",
},
@@ -29,7 +29,7 @@ import (
// canonicalProvidersYAMLSHA256 is the sha256 of the canonical providers.yaml as
// synced from molecule-controlplane. Bumped deliberately on each re-sync (see
// file doc). Cross-checked live by the sync-providers-yaml CI workflow.
const canonicalProvidersYAMLSHA256 = "9eb6f97fc37b528c91936be4a75dd87f6c7172742b4535d76b9bb2231ee18e80"
const canonicalProvidersYAMLSHA256 = "846ddef11ec423ebf2e96b5da21bd89129dbc3f0a2d14ac086940e005c079387"
func TestSyncedYAMLMatchesCanonicalSHA(t *testing.T) {
sum := sha256.Sum256(embeddedYAML)
@@ -36,22 +36,11 @@ package registry
// runtime <> 'external'. Paused/hibernated/removed/provisioning/
// awaiting_agent rows are out of scope; external rows are covered by
// the remote-heartbeat pass.
// - Per-cycle row cap + per-cycle deadline + per-workspace timeout so
// one slow CP call (or a degraded-but-not-erroring CP) can't stall
// the sweep.
// - TOCTOU re-confirm before any flip: IsRunning resolves instance_id
// independently, so a row whose instance_id was cleared/NULLed (by a
// concurrent delete, the CP-orphan-sweeper, or a reprovision) between
// the reconciler's SELECT and the IsRunning probe yields a STALE
// (false, nil) that does NOT prove the EC2 is dead. We re-read the
// row's current (status, instance_id) and flip ONLY when the SAME
// non-empty instance we asked CP about is still the workspace's
// recorded instance AND it's still online/degraded. Mirrors the
// guarded-write re-confirm in healthsweep.
// - Per-cycle row cap + per-workspace timeout so one slow CP call can't
// stall the sweep.
import (
"context"
"database/sql"
"log"
"time"
@@ -81,20 +70,6 @@ const CPInstanceReconcileLimit = 200
// timeout context derived from the cycle context.
const cpInstanceCheckTimeout = 10 * time.Second
// cpInstanceCycleDeadline bounds the wall-time of one whole reconcile
// pass. With per-workspace 10s timeouts and a 200-row cap, a degraded-
// but-not-erroring CP (each IsRunning slow but under the per-workspace
// cap) could otherwise drag one cycle out for tens of minutes and starve
// the next tick. Mirrors cp_orphan_sweeper's orphanSweepDeadline; chosen
// under the 60s interval so a stuck cycle is abandoned before the next
// one is due and the backlog drains across subsequent cycles.
const cpInstanceCycleDeadline = 45 * time.Second
// cpInstanceReconfirmTimeout bounds the TOCTOU re-confirm read. This is a
// single indexed primary-key lookup, so it should never be slow; a tight
// timeout keeps the re-confirm from itself becoming a stall point.
const cpInstanceReconfirmTimeout = 5 * time.Second
// StartCPInstanceReconciler runs the authoritative EC2-state reconcile
// loop until ctx is cancelled. A nil checker makes the loop a no-op
// (matches the nil-tolerant pattern of the sibling CP sweeper).
@@ -131,41 +106,21 @@ func StartCPInstanceReconciler(ctx context.Context, checker InstanceRunningCheck
}
}
// reconcileRow pairs a workspace id with the instance_id captured in the
// SAME SELECT, so the TOCTOU re-confirm can verify CP's (false, nil)
// answer is about the instance the row still records — not one cleared
// out from under us between the SELECT and the IsRunning probe.
type reconcileRow struct {
id string
instanceID string
}
// reconcileOnce executes one reconcile pass. Defensive against db.DB
// being nil so a misconfigured boot doesn't panic.
//
// Scope: online/degraded + SaaS-EC2 workspaces only. runtime='external'
// rows are excluded (covered by the remote-heartbeat pass); paused/
// hibernated/removed/provisioning/awaiting_agent are excluded by the
// status filter. `degraded` is included because a SaaS workspace whose
// heartbeat handler flipped it degraded then lost its EC2 falls through
// every other sweep (matches healthsweep's `status IN ('online',
// 'degraded')`).
func reconcileOnce(parent context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
// Scope: online + SaaS-EC2 workspaces only. runtime='external' rows are
// excluded (covered by the remote-heartbeat pass); paused/hibernated/
// removed/provisioning/awaiting_agent are excluded by the status filter.
func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
if db.DB == nil {
return
}
// Per-cycle deadline so a degraded-but-not-erroring CP (each IsRunning
// slow but under the per-workspace cap) can't drag one cycle out for
// tens of minutes and starve the next tick. Per-workspace IsRunning
// timeouts derive from this cycle context.
cycleCtx, cancelCycle := context.WithTimeout(parent, cpInstanceCycleDeadline)
defer cancelCycle()
rows, err := db.DB.QueryContext(cycleCtx, `
SELECT id::text, instance_id
rows, err := db.DB.QueryContext(ctx, `
SELECT id::text
FROM workspaces
WHERE status IN ('online', 'degraded')
WHERE status = 'online'
AND instance_id IS NOT NULL
AND instance_id != ''
AND COALESCE(runtime, '') <> 'external'
@@ -178,130 +133,46 @@ func reconcileOnce(parent context.Context, checker InstanceRunningChecker, onOff
}
defer rows.Close()
var candidates []reconcileRow
var ids []string
for rows.Next() {
var r reconcileRow
if scanErr := rows.Scan(&r.id, &r.instanceID); scanErr != nil {
var id string
if scanErr := rows.Scan(&id); scanErr != nil {
log.Printf("cp-instance-reconciler: row scan failed: %v", scanErr)
continue
}
candidates = append(candidates, r)
ids = append(ids, id)
}
if iterErr := rows.Err(); iterErr != nil {
log.Printf("cp-instance-reconciler: rows iteration failed: %v", iterErr)
return
}
processed, skipped := 0, 0
for _, c := range candidates {
// Abandon the cycle if we've blown the per-cycle deadline; the
// next tick re-reads from the top (ORDER BY updated_at DESC) and
// drains the backlog. Without this a slow CP could keep one cycle
// running past its interval and never let a fresh one start.
if cycleCtx.Err() != nil {
log.Printf("cp-instance-reconciler: cycle deadline reached — processed %d, %d skipped (TOCTOU/changed), remaining deferred to next cycle", processed, skipped)
return
}
processed++
for _, id := range ids {
// Per-workspace timeout so one slow CP round-trip can't stall
// the whole sweep. Derived from cycleCtx so the cycle deadline
// always dominates.
checkCtx, cancel := context.WithTimeout(cycleCtx, cpInstanceCheckTimeout)
running, checkErr := checker.IsRunning(checkCtx, c.id)
// the whole sweep.
checkCtx, cancel := context.WithTimeout(ctx, cpInstanceCheckTimeout)
running, checkErr := checker.IsRunning(checkCtx, id)
cancel()
if checkErr != nil {
// FAIL-SAFE: transient DB/transport error (or a no-backend
// signal). IsRunning returns (true, err) on these, so never
// flip — leave the row online and retry next cycle.
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", c.id, checkErr)
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", id, checkErr)
continue
}
if running {
continue
}
// (false, nil) is NOT yet proof the EC2 is dead. IsRunning
// resolves instance_id independently (resolveInstanceID); if the
// row's instance_id was cleared/NULLed (concurrent delete, the
// CP-orphan-sweeper NULLing it, a reprovision) or the row moved
// off online/degraded between our SELECT and this probe,
// IsRunning returns a STALE (false, nil) that reflects a missing
// instance_id, NOT a confirmed-terminated EC2. Re-confirm against
// the row's CURRENT state and flip ONLY when the SAME non-empty
// instance we asked CP about is still recorded AND the row is
// still online/degraded. Mirrors healthsweep's guarded write.
if !reconfirmStillOfflineCandidate(cycleCtx, c) {
skipped++
continue
}
// CONFIRMED "not running" — CP authoritatively reports the EC2 is
// terminated/stopped/absent AND the row still records that exact
// instance as online/degraded. Feed it into the existing offline +
// CLEAN "not running" — CP authoritatively reports the EC2 is
// terminated/stopped/absent. Feed it into the existing offline +
// auto-heal machinery: onOffline flips the row offline and
// triggers RestartByID, which reprovisions with the existing
// volume.
log.Printf("cp-instance-reconciler: workspace %s (instance %s) is online/degraded but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", c.id, c.instanceID)
log.Printf("cp-instance-reconciler: workspace %s is status=online but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", id)
if onOffline != nil {
onOffline(cycleCtx, c.id)
onOffline(ctx, id)
}
}
}
// reconfirmStillOfflineCandidate re-reads the workspace's CURRENT
// (status, instance_id) and reports whether it is STILL a valid offline
// candidate for the instance we just probed. It returns true ONLY when:
//
// - the row still exists, AND
// - current status IN ('online','degraded'), AND
// - current instance_id is non-empty, AND
// - current instance_id == the instance_id captured in the original
// SELECT (the one whose liveness CP just answered about).
//
// Any other outcome (row gone, status moved off online/degraded,
// instance_id cleared or now points at a different instance) means the
// IsRunning (false, nil) was a stale/cleared-instance snapshot rather
// than a confirmed-terminated EC2 — return false so the caller skips the
// flip. A DB error during re-confirm is treated as "not confirmed"
// (false): fail-safe toward NOT flipping a workspace we can't re-verify.
func reconfirmStillOfflineCandidate(parent context.Context, c reconcileRow) bool {
if db.DB == nil {
return false
}
ctx, cancel := context.WithTimeout(parent, cpInstanceReconfirmTimeout)
defer cancel()
var curStatus, curInstanceID string
err := db.DB.QueryRowContext(ctx, `
SELECT status, COALESCE(instance_id, '')
FROM workspaces
WHERE id = $1
`, c.id).Scan(&curStatus, &curInstanceID)
if err != nil {
if err == sql.ErrNoRows {
// Row deleted between SELECT and re-confirm — definitely not a
// terminated-EC2 signal. Skip.
log.Printf("cp-instance-reconciler: re-confirm %s: row gone — skipping flip (stale snapshot, not a dead EC2)", c.id)
return false
}
// Transient DB error — fail-safe toward NOT flipping.
log.Printf("cp-instance-reconciler: re-confirm %s errored, skipping flip (fail-safe): %v", c.id, err)
return false
}
if curStatus != "online" && curStatus != "degraded" {
log.Printf("cp-instance-reconciler: re-confirm %s: status moved to %q since SELECT — skipping flip", c.id, curStatus)
return false
}
if curInstanceID == "" {
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id cleared since SELECT — skipping flip (CP answered about a now-detached instance)", c.id)
return false
}
if curInstanceID != c.instanceID {
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id changed %s -> %s since SELECT (reprovision) — skipping flip", c.id, c.instanceID, curInstanceID)
return false
}
return true
}
@@ -63,48 +63,16 @@ func (r *recordingOffline) got() []string {
}
// expectReconcileQuery registers the reconciler's SELECT, pinning the
// scope-critical predicates: status IN ('online','degraded'), instance_id
// present (captured as a column for the TOCTOU re-confirm), and runtime
// <> 'external'. A future widening that drops any of these (e.g. sweeping
// paused rows, or external rows the heartbeat pass owns), or that drops
// the instance_id column the re-confirm depends on, fails every test that
// uses this helper.
// scope-critical predicates: status='online', instance_id present, and
// runtime <> 'external'. A future widening that drops any of these (e.g.
// sweeping paused rows, or external rows the heartbeat pass owns) fails
// every test that uses this helper.
func expectReconcileQuery(mock sqlmock.Sqlmock, rows *sqlmock.Rows) {
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces\s+WHERE status = 'online'\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
WithArgs(CPInstanceReconcileLimit).
WillReturnRows(rows)
}
// reconcileRows builds the two-column (id, instance_id) result the
// reconciler's SELECT now returns. Pass id/instance_id pairs.
func reconcileRows(pairs ...[2]string) *sqlmock.Rows {
r := sqlmock.NewRows([]string{"id", "instance_id"})
for _, p := range pairs {
r.AddRow(p[0], p[1])
}
return r
}
// expectReconfirm registers the TOCTOU re-confirm primary-key lookup for
// workspace id `wsID`, returning the row's CURRENT (status, instance_id).
// This is what the reconciler re-reads after IsRunning returns (false,
// nil), before it flips: it only flips when the SAME non-empty instance
// is still recorded AND status is still online/degraded.
func expectReconfirm(mock sqlmock.Sqlmock, wsID, curStatus, curInstanceID string) {
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}).AddRow(curStatus, curInstanceID))
}
// expectReconfirmNoRows registers a re-confirm lookup that finds the row
// gone (deleted between SELECT and re-confirm) — the reconciler must
// treat this as "not a dead EC2" and skip the flip.
func expectReconfirmNoRows(mock sqlmock.Sqlmock, wsID string) {
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}))
}
// TestReconcileOnce_NotRunning_FlipsOffline — the core bug (core#2247):
// an online SaaS workspace whose EC2 is terminated. CP reports a CLEAN
// (false, nil); onOffline MUST be called with that id so the existing
@@ -114,10 +82,7 @@ func TestReconcileOnce_NotRunning_FlipsOffline(t *testing.T) {
checker := &fakeRunningChecker{running: map[string]bool{"ws-dead": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-dead", "i-dead"}))
// (false,nil) → re-confirm: row still online with the SAME instance →
// confirmed-dead → flip.
expectReconfirm(mock, "ws-dead", "online", "i-dead")
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-dead"))
reconcileOnce(context.Background(), checker, off.handler())
@@ -137,8 +102,7 @@ func TestReconcileOnce_Running_DoesNotFlip(t *testing.T) {
checker := &fakeRunningChecker{running: map[string]bool{"ws-alive": true}}
off := &recordingOffline{}
// Running → no re-confirm, no flip.
expectReconcileQuery(mock, reconcileRows([2]string{"ws-alive", "i-alive"}))
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-alive"))
reconcileOnce(context.Background(), checker, off.handler())
@@ -162,9 +126,7 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
}
off := &recordingOffline{}
// (true,err) short-circuits BEFORE the re-confirm — no re-confirm query
// is registered, so a stray re-confirm would fail ExpectationsWereMet.
expectReconcileQuery(mock, reconcileRows([2]string{"ws-blip", "i-blip"}))
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-blip"))
reconcileOnce(context.Background(), checker, off.handler())
@@ -181,20 +143,19 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
// TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline — pins the
// SELECT predicate. The regex in expectReconcileQuery requires
// status IN ('online','degraded') AND runtime <> 'external'; if a future
// edit widens the scope to include paused/hibernated/removed rows or
// external rows (owned by the heartbeat pass), or narrows it back to drop
// 'degraded', this query no longer matches and sqlmock fails the test.
// With the predicate intact, a DB that has only out-of-scope rows returns
// empty → no IsRunning, no flip.
// status='online' AND runtime <> 'external'; if a future edit widens the
// scope to include paused/hibernated/removed rows or external rows (owned
// by the heartbeat pass), this query no longer matches and sqlmock fails
// the test. With the predicate intact, a DB that has only out-of-scope
// rows returns empty → no IsRunning, no flip.
func TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{}
off := &recordingOffline{}
// The predicate filters out external + out-of-scope-status rows
// server-side, modelled as the empty result those filters produce.
expectReconcileQuery(mock, reconcileRows())
// The predicate filters out external + non-online rows server-side,
// modelled as the empty result those filters produce.
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}))
reconcileOnce(context.Background(), checker, off.handler())
@@ -219,13 +180,10 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows(
[2]string{"ws-dead", "i-dead"},
[2]string{"ws-alive", "i-alive"},
[2]string{"ws-blip", "i-blip"},
))
// Only ws-dead reaches the re-confirm ((false,nil)); it confirms.
expectReconfirm(mock, "ws-dead", "online", "i-dead")
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).
AddRow("ws-dead").
AddRow("ws-alive").
AddRow("ws-blip"))
reconcileOnce(context.Background(), checker, off.handler())
@@ -237,147 +195,6 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
}
}
// TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip — the HIGH-1
// regression guard. IsRunning returns a CLEAN (false, nil), but between
// the reconciler's SELECT and the probe the row's instance_id changed
// (reprovision attached a fresh EC2). IsRunning's independent
// resolveInstanceID is the reason the (false,nil) is stale: it may have
// resolved an empty/old instance. The re-confirm sees a DIFFERENT
// instance_id and MUST skip — flipping here would knock out a workspace
// whose NEW EC2 is not proven dead and fire RestartByID on a just-
// reprovisioned row.
func TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-race": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-race", "i-old"}))
// Re-confirm: row is still online but now points at a DIFFERENT
// instance (reprovisioned) → the (false,nil) was about i-old which is
// no longer attached → skip.
expectReconfirm(mock, "ws-race", "online", "i-new")
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: instance_id changed since SELECT must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip — same HIGH-1
// guard, the instance_id-NULLed variant (CP-orphan-sweeper or a delete
// cleared it). Re-confirm sees an empty instance_id → skip.
func TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-cleared": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-cleared", "i-gone"}))
expectReconfirm(mock, "ws-cleared", "online", "") // instance_id cleared
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: cleared instance_id must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip — same HIGH-1 guard,
// the status-moved variant. The row left online/degraded (e.g. paused or
// removed) between SELECT and re-confirm → skip.
func TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-paused": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-paused", "i-keep"}))
expectReconfirm(mock, "ws-paused", "paused", "i-keep") // status moved out of scope
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: row no longer online/degraded must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip — same HIGH-1 guard, the
// row-deleted variant. The re-confirm finds no row (concurrent delete) →
// skip; a stale (false,nil) about a just-deleted row must never fire
// onOffline/RestartByID.
func TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-deleted": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-deleted", "i-x"}))
expectReconfirmNoRows(mock, "ws-deleted") // row gone
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("TOCTOU guard violated: deleted row must NOT flip, got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_Degraded_FlipsOffline — MED-3 scope. A `degraded`
// SaaS workspace whose EC2 is gone is otherwise covered by NO sweep. It's
// in scope (the SELECT regex requires status IN ('online','degraded')),
// CP reports (false,nil), the re-confirm shows it STILL degraded with the
// SAME instance → flip.
func TestReconcileOnce_Degraded_FlipsOffline(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-degraded": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-degraded", "i-deg"}))
expectReconfirm(mock, "ws-degraded", "degraded", "i-deg")
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 1 || got[0] != "ws-degraded" {
t.Fatalf("expected onOffline(ws-degraded), got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconfirm_DBError_DoesNotFlip — re-confirm fail-safe. If the
// re-confirm read itself errors (transient DB blip), we treat it as "not
// confirmed" and skip the flip rather than acting on an unverifiable
// (false,nil).
func TestReconcileOnce_ReconfirmDBError_DoesNotFlip(t *testing.T) {
mock := setupTestDB(t)
checker := &fakeRunningChecker{running: map[string]bool{"ws-x": false}}
off := &recordingOffline{}
expectReconcileQuery(mock, reconcileRows([2]string{"ws-x", "i-x"}))
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
WithArgs("ws-x").
WillReturnError(errors.New("connection reset"))
reconcileOnce(context.Background(), checker, off.handler())
if got := off.got(); len(got) != 0 {
t.Fatalf("re-confirm DB error must fail-safe (no flip), got %v", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet expectations: %v", err)
}
}
// TestReconcileOnce_QueryError — DB transient failure. Reconcile returns
// without panicking and never probes IsRunning or flips anything.
func TestReconcileOnce_QueryError(t *testing.T) {
@@ -385,7 +202,7 @@ func TestReconcileOnce_QueryError(t *testing.T) {
checker := &fakeRunningChecker{}
off := &recordingOffline{}
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces`).
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
WithArgs(CPInstanceReconcileLimit).
WillReturnError(errors.New("connection refused"))