- name: Set deterministic admin token for the e2e platform
  if: needs.detect-changes.outputs.api == 'true'
  run: |
    # One value, published under two names via $GITHUB_ENV so every later
    # step in this job (including the platform process) inherits both:
    #
    #   ADMIN_TOKEN          - the secret the platform's AdminAuth middleware
    #                          compares bearers against. Per the platform
    #                          sources referenced by this workflow
    #                          (wsauth_middleware.go / devmode.go), a non-empty
    #                          value turns dev-mode fail-open off, so admin
    #                          routes require a bearer.
    #   MOLECULE_ADMIN_TOKEN - the variable the e2e scripts read to build the
    #                          bearer they send (tests/e2e/_lib.sh and the
    #                          mock org-import call).
    #
    # Both MUST carry the same value: the bearer the tests send has to equal
    # the secret the platform checks. The value is derived from the run id and
    # attempt, which is acceptable because this platform is ephemeral,
    # single-run, and never reachable off-host.
    admin_token="e2e-api-admin-${{ github.run_id }}-${{ github.run_attempt }}"
    {
      echo "ADMIN_TOKEN=${admin_token}"
      echo "MOLECULE_ADMIN_TOKEN=${admin_token}"
    } >> "$GITHUB_ENV"
    echo "Admin token configured for the e2e platform (ADMIN_TOKEN + MOLECULE_ADMIN_TOKEN)."
That guarantees VALIDATED>=1 on a healthy platform, so the + # REQUIRED `E2E API Smoke Test` gate now HONESTLY validates a runtime + # end-to-end; if the mock plumbing (DB insert, status flip, A2A proxy, + # activity logging, or the admin-auth wiring) genuinely breaks, the gate + # goes RED instead of false-green. The zero-validated→RED decision is also + # regression-gated WITHOUT provisioning by the bash unit test + # tests/e2e/test_require_live_priority_gate_unit.sh (wired into ci.yml's + # "Run E2E bash unit tests" job), so a revert of that logic still fails CI. + # + # MiniMax stays an OPPORTUNISTIC best-effort arm: create is registry-fragile + # in CI (422 UNREGISTERED_MODEL_FOR_RUNTIME), so a miss is reported via + # bestfail() and never reds the gate — mock carries the required validation, + # MiniMax is a bonus real-LLM check when it comes up. ZERO new credentials. if: needs.detect-changes.outputs.api == 'true' + env: + E2E_REQUIRE_LIVE: '1' + E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }} run: bash tests/e2e/test_priority_runtimes_e2e.sh - name: Install standalone runtime parser from Gitea registry if: needs.detect-changes.outputs.api == 'true' diff --git a/tests/e2e/_lib.sh b/tests/e2e/_lib.sh index 6ade61136..f287be514 100755 --- a/tests/e2e/_lib.sh +++ b/tests/e2e/_lib.sh @@ -17,6 +17,33 @@ e2e_extract_token() { python3 "$(dirname "${BASH_SOURCE[0]}")/_extract_token.py" } +# Populate a curl-args array with the platform admin bearer, IF one is set. +# +# AdminAuth (workspace-server/internal/middleware/wsauth_middleware.go:161) +# fail-opens ONLY while ADMIN_TOKEN is unset AND no workspace token exists yet +# (devmode.go:50). 
# e2e_admin_auth_args ARRAY_NAME
#
# Fill the caller's array ARRAY_NAME with the curl arguments that carry the
# platform admin bearer, or empty it when no admin token is configured.
#
# The bearer is taken from MOLECULE_ADMIN_TOKEN, falling back to ADMIN_TOKEN.
# When neither is set the array becomes empty, so a bootstrap/dev platform that
# runs without an admin token keeps working with zero auth.
#
# Arguments:
#   $1  name of the array variable to populate (must be a valid identifier).
# Returns:
#   0 on success; 2 (with a message on stderr) when $1 is missing or is not a
#   valid shell identifier.
#
# Usage:
#   local admin_auth=(); e2e_admin_auth_args admin_auth
#   curl -s "$BASE/workspaces" ${admin_auth[@]+"${admin_auth[@]}"}
#
# Implementation notes:
#   - eval is used for the indirect array assignment so the helper keeps
#     working on old bash (this file already guards for bash <4.4); namerefs
#     would need bash 4.3+.
#   - Because the name is eval'd, it is validated first so nothing but a plain
#     identifier can ever reach eval.
#   - Internal locals carry a long private prefix: a local with the same name
#     as the caller's array would shadow it, and the assignment would silently
#     land on the local instead of the caller's variable.
e2e_admin_auth_args() {
  local __e2e_auth_outname="${1:-}"
  case "$__e2e_auth_outname" in
    '' | [0-9]* | *[!A-Za-z0-9_]*)
      echo "e2e_admin_auth_args: expected an array variable name, got '${__e2e_auth_outname}'" >&2
      return 2
      ;;
  esac

  local __e2e_auth_bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
  if [ -n "$__e2e_auth_bearer" ]; then
    # \$__e2e_auth_bearer is expanded by eval inside double quotes, so the
    # token value itself is never re-parsed as shell syntax.
    eval "$__e2e_auth_outname=(-H \"Authorization: Bearer \$__e2e_auth_bearer\")"
  else
    eval "$__e2e_auth_outname=()"
  fi
}
This form + # expands to nothing when the array is empty. Callers from the priority- + # runtimes EXIT trap pass no extra curl args, so the array IS empty there — + # without the guard the trap aborts non-zero AFTER the gate already passed, + # turning a validated run RED. (Same idiom already used for CREATED_WSIDS.) if [ -z "$name" ]; then - name=$(curl -s "$BASE/workspaces/$wid" "${curl_args[@]}" | python3 -c "import json,sys + name=$(curl -s "$BASE/workspaces/$wid" ${curl_args[@]+"${curl_args[@]}"} | python3 -c "import json,sys try: print(json.load(sys.stdin).get('name','')) except Exception: pass" 2>/dev/null || true) fi curl -s -X DELETE "$BASE/workspaces/$wid?confirm=true" \ - -H "X-Confirm-Name: $name" "${curl_args[@]}" > /dev/null || true + -H "X-Confirm-Name: $name" ${curl_args[@]+"${curl_args[@]}"} > /dev/null || true } e2e_cleanup_all_workspaces() { - curl -s "$BASE/workspaces" | python3 -c "import json,sys + # GET /workspaces (list) is AdminAuth-gated (router.go:165). Send the platform + # admin bearer if one is set so the list doesn't 401 → empty → no cleanup. + local _admin_auth=() + e2e_admin_auth_args _admin_auth + curl -s "$BASE/workspaces" ${_admin_auth[@]+"${_admin_auth[@]}"} | python3 -c "import json,sys try: [print(f\"{w.get('id','')}\\t{w.get('name','')}\") for w in json.load(sys.stdin)] except Exception: diff --git a/tests/e2e/test_api.sh b/tests/e2e/test_api.sh index 26d443e8e..b283d6bc7 100644 --- a/tests/e2e/test_api.sh +++ b/tests/e2e/test_api.sh @@ -15,18 +15,27 @@ SUM_AUTH=() ECHO_URL="https://example.com/echo-agent" SUM_URL="https://example.com/summarizer-agent" -# AdminAuth-gated calls need a bearer token once any workspace token -# exists in the DB. ADMIN_TOKEN is populated after the first workspace -# create + real token mint. acurl = "authenticated curl". -ADMIN_TOKEN="" +# AdminAuth-gated calls (GET/POST/DELETE /workspaces, /events, /bundles) +# require the platform admin bearer once ADMIN_TOKEN is set on the server. 
# Admin bearer used by acurl ("admin curl").
#
# The value comes from MOLECULE_ADMIN_TOKEN, falling back to ADMIN_TOKEN.
# ADMIN_AUTH holds the ready-made curl header arguments, and stays empty when
# no admin token is configured so a platform running without one is still
# reachable with no auth header at all.
ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
ADMIN_AUTH=()
if [ -n "$ADMIN_BEARER" ]; then
  ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
fi

# acurl ARGS... - silent curl that prepends the admin bearer header (when one
# is configured) to the caller's arguments.
acurl() {
  local -a curl_argv=(-s)
  # Length check instead of expanding a possibly-empty array directly, which
  # trips `set -u` on older bash.
  if [ "${#ADMIN_AUTH[@]}" -gt 0 ]; then
    curl_argv+=("${ADMIN_AUTH[@]}")
  fi
  curl "${curl_argv[@]}" "$@"
}
+R=$(acurl -X POST "$BASE/workspaces" -H "Content-Type: application/json" -d '{"name":"Echo Agent","tier":1,"runtime":"external","external":true}') check "POST /workspaces (create echo)" '"status":"awaiting_agent"' "$R" ECHO_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") -ADMIN_TOKEN=$(echo "$R" | e2e_extract_token) -if [ -z "$ADMIN_TOKEN" ]; then - ADMIN_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "") +# Per-workspace token for Echo, for the WorkspaceAuth-gated routes below. +WORKSPACE_TOKEN=$(echo "$R" | e2e_extract_token) +if [ -z "$WORKSPACE_TOKEN" ]; then + WORKSPACE_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "") fi -if [ -n "$ADMIN_TOKEN" ]; then - echo " (acquired admin token: ${ADMIN_TOKEN:0:8}...)" +if [ -n "$WORKSPACE_TOKEN" ]; then + echo " (acquired Echo workspace token: ${WORKSPACE_TOKEN:0:8}...)" else - echo " WARNING: no admin token acquired — subsequent AdminAuth calls will fail" + echo " WARNING: no Echo workspace token acquired — WorkspaceAuth calls will fail" fi # Test 4: Create workspace B (needs bearer — tokens now exist in DB) @@ -98,7 +110,7 @@ check "GET /workspaces/:id (agent_card null)" '"agent_card":null' "$R" # Test 7: Register echo — use workspace-specific token (from real admin # endpoint), not the admin token. C18 requires a token issued TO THIS # workspace, not just any valid token. 
-ECHO_WS_TOKEN="$ADMIN_TOKEN" +ECHO_WS_TOKEN="$WORKSPACE_TOKEN" [ -n "$ECHO_WS_TOKEN" ] && ECHO_AUTH=(-H "Authorization: Bearer $ECHO_WS_TOKEN") R=$(curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" \ "${ECHO_AUTH[@]}" \ @@ -159,26 +171,29 @@ R=$(curl -s -X POST "$BASE/registry/check-access" -H "Content-Type: application/ -d "{\"caller_id\":\"$ECHO_ID\",\"target_id\":\"$SUM_ID\"}") check "POST /registry/check-access (same-org allowed)" '"allowed":true' "$R" -# Test 15: PATCH workspace (update position) -R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}') +# Test 15: PATCH workspace (update position). PATCH /workspaces/:id is +# WorkspaceAuth-gated (router.go:227 — #680 IDOR fix), so it needs Echo's OWN +# bearer, NOT the admin bearer (WorkspaceAuth rejects the admin token). +R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}') check "PATCH /workspaces/:id (position)" '"status":"updated"' "$R" R=$(acurl "$BASE/workspaces/$ECHO_ID") check "Position saved (x=100)" '"x":100' "$R" check "Position saved (y=200)" '"y":200' "$R" -# Test 16: PATCH workspace (update name) -R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}') +# Test 16: PATCH workspace (update name) — WorkspaceAuth-gated; use Echo's token. 
+R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}') check "PATCH /workspaces/:id (name)" '"status":"updated"' "$R" R=$(acurl "$BASE/workspaces/$ECHO_ID") check "Name updated" '"name":"Echo Agent v2"' "$R" -# Test 17: Events (#165 / PR #167 — now admin-gated, bearer required) -R=$(acurl "$BASE/events" -H "Authorization: Bearer $ECHO_TOKEN") +# Test 17: Events (#165 / PR #167 — admin-gated; the admin bearer is required, +# and Tier-2b rejects a workspace bearer here, so use acurl's admin token alone). +R=$(acurl "$BASE/events") check "GET /events (has events)" 'WORKSPACE_ONLINE' "$R" -R=$(acurl "$BASE/events/$ECHO_ID" -H "Authorization: Bearer $ECHO_TOKEN") +R=$(acurl "$BASE/events/$ECHO_ID") check "GET /events/:id (has events for echo)" 'WORKSPACE_ONLINE' "$R" # Test 18: Update card @@ -295,7 +310,7 @@ check "active_tasks cleared" '"active_tasks":0' "$R" # endpoint is admin-auth gated and keeps the full record, so operators # can still see task progress from the dashboard without exposing it # over the public per-workspace GET. -R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN") +R=$(acurl "$BASE/workspaces") check "current_task in list response" '"current_task"' "$R" # Test 21: Delete @@ -306,18 +321,20 @@ check "current_task in list response" '"current_task"' "$R" # Delete the CHILD (Summarizer) here instead: a child delete does NOT cascade # upward, so the parent Echo survives and count=1 holds. The bundle round-trip # below needs Summarizer's exported config, so capture it BEFORE this delete. -BUNDLE=$(curl -s "$BASE/bundles/export/$SUM_ID" -H "Authorization: Bearer $SUM_TOKEN") +# GET /bundles/export/:id is admin-gated (router.go:741) — use the admin bearer. 
+BUNDLE=$(acurl "$BASE/bundles/export/$SUM_ID") check "GET /bundles/export/:id" '"name":"Summarizer Agent"' "$BUNDLE" ORIG_NAME=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['name'])") ORIG_TIER=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['tier'])") +# DELETE /workspaces/:id is admin-gated (router.go:167). X-Confirm-Name must +# still match the workspace name even with admin auth. R=$(acurl -X DELETE "$BASE/workspaces/$SUM_ID?confirm=true" \ - -H "Authorization: Bearer $SUM_TOKEN" \ -H "X-Confirm-Name: Summarizer Agent") check "DELETE /workspaces/:id" '"status":"removed"' "$R" -# Parent Echo must survive a child delete — list as Echo and expect count=1. -R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN") +# Parent Echo must survive a child delete — list (admin) and expect count=1. +R=$(acurl "$BASE/workspaces") COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))") check "List after delete (count=1)" "1" "$COUNT" @@ -328,21 +345,21 @@ check "List after delete (count=1)" "1" "$COUNT" echo "" echo "--- Bundle Round-Trip Test ---" -# Delete the remaining parent Echo — use ECHO_TOKEN (per-workspace) for -# WorkspaceAuth and ADMIN_TOKEN for the AdminAuth layer. +# Delete the remaining parent Echo — DELETE is admin-gated (router.go:167); +# the platform admin bearer (acurl) authorizes it. X-Confirm-Name still required. R=$(acurl -X DELETE "$BASE/workspaces/$ECHO_ID?confirm=true" \ - -H "Authorization: Bearer $ECHO_TOKEN" \ -H "X-Confirm-Name: Echo Agent v2") check "Delete before re-import" '"status":"removed"' "$R" -# After deleting both workspaces, all per-workspace tokens are revoked. -# Clear the now-revoked admin bearer so acurl can use fresh-install fail-open. -ADMIN_TOKEN="" +# Both workspaces are now deleted. 
The platform-level ADMIN_TOKEN env is still +# set, so admin routes still require the admin bearer (fail-open does NOT +# re-engage just because the token table emptied) — keep using acurl's bearer. R=$(acurl "$BASE/workspaces") COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))") check "All workspaces deleted (count=0)" "0" "$COUNT" -# Re-import from the exported bundle (AdminAuth fail-open — no live tokens) +# Re-import from the exported bundle. POST /bundles/import is admin-gated +# (router.go:742) — acurl sends the admin bearer. R=$(acurl -X POST "$BASE/bundles/import" -H "Content-Type: application/json" -d "$BUNDLE") check "POST /bundles/import" '"status":"provisioning"' "$R" NEW_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['workspace_id'])") @@ -398,12 +415,15 @@ check "Register re-imported workspace" '"status":"registered"' "$R" REG_NEW_TOKEN=$(echo "$R" | e2e_extract_token) [ -n "$REG_NEW_TOKEN" ] && NEW_TOKEN="$REG_NEW_TOKEN" -# Re-export and verify agent_card survives the round-trip (#165 / PR #167 — admin-gated) -REBUNDLE=$(curl -s "$BASE/bundles/export/$NEW_ID" -H "Authorization: Bearer $NEW_TOKEN") +# Re-export and verify agent_card survives the round-trip (#165 / PR #167 — +# GET /bundles/export/:id is admin-gated; use the admin bearer). +REBUNDLE=$(acurl "$BASE/bundles/export/$NEW_ID") check "Re-exported bundle has agent_card" '"agent_card"' "$REBUNDLE" -# Clean up — use the token just issued to the re-imported workspace -e2e_delete_workspace "$NEW_ID" "$ORIG_NAME" -H "Authorization: Bearer $NEW_TOKEN" +# Clean up — DELETE /workspaces/:id is admin-gated; pass no per-call auth so +# e2e_delete_workspace falls back to the platform admin bearer (a workspace +# bearer would be rejected by Tier-2b). 
+e2e_delete_workspace "$NEW_ID" "$ORIG_NAME" echo "" echo "=== Results: $PASS passed, $FAIL failed ===" diff --git a/tests/e2e/test_notify_attachments_e2e.sh b/tests/e2e/test_notify_attachments_e2e.sh index 0d92bfe46..e6992759d 100755 --- a/tests/e2e/test_notify_attachments_e2e.sh +++ b/tests/e2e/test_notify_attachments_e2e.sh @@ -28,6 +28,13 @@ PASS=0 FAIL=0 WSID="" +# GET /workspaces (list) and POST /workspaces (create) are AdminAuth-gated +# (router.go:165-166). The e2e-api CI job sets ADMIN_TOKEN on the platform +# (fail-open OFF) and exports MOLECULE_ADMIN_TOKEN here, so these calls need the +# admin bearer. Guarded if-set so a fail-open dev platform still works. +ADMIN_AUTH=() +e2e_admin_auth_args ADMIN_AUTH + cleanup() { # Workspace teardown — best-effort, ignore errors so an unrelated CP # outage doesn't shadow a real test failure. @@ -80,7 +87,7 @@ echo "=== Setup ===" # canvas. Find and delete any with this exact name so the test is safe to # re-run from any state. Match by name (not tag) so this also catches # leftovers created by older script versions. -PRIOR=$(curl -s "$BASE/workspaces" | python3 -c ' +PRIOR=$(curl -s "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} | python3 -c ' import json, sys try: print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name") == "Notify E2E")) @@ -96,7 +103,7 @@ done # feedback_workspace_model_required_no_platform_default_dynamic_credential_intake). # Body has no runtime → defaults to claude-code; pass the matching model # that the workspace-creation contract now requires. 
-R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \ +R=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \ -d '{"name":"Notify E2E","tier":1,"runtime":"external","external":true,"model":"sonnet"}') WSID=$(echo "$R" | python3 -c 'import json,sys;print(json.load(sys.stdin)["id"])' 2>/dev/null || true) [ -n "$WSID" ] || { echo "Failed to create workspace: $R"; exit 1; } diff --git a/tests/e2e/test_priority_runtimes_e2e.sh b/tests/e2e/test_priority_runtimes_e2e.sh index 7785f87f4..61b852a11 100755 --- a/tests/e2e/test_priority_runtimes_e2e.sh +++ b/tests/e2e/test_priority_runtimes_e2e.sh @@ -24,11 +24,73 @@ # Each phase skips cleanly when its prerequisite secret is absent so a # partially-keyed env (e.g. CI without an OpenAI key) doesn't false-fail. # +# REQUIRE-LIVE (false-green guard, mirrors CP serving-e2e's +# SERVING_E2E_REQUIRE_LIVE semantics) +# ------------------------------------------------------------------ +# Without a guard, an env with NO live secrets makes every phase SKIP, +# leaving PASS=0 FAIL=0 — and the historical `[ "$FAIL" -eq 0 ]` gate +# exits 0 (GREEN) while validating ZERO runtimes. That made the REQUIRED +# `E2E API Smoke Test` merge gate pass without exercising a single +# runtime (false-green). +# +# Fix: a real "validated arm" counter (VALIDATED) tracks runtimes that +# actually ran AND produced a non-error A2A reply. With E2E_REQUIRE_LIVE=1: +# if zero arms validated, the run exits NON-zero with a loud message. +# Without it (E2E_REQUIRE_LIVE unset/0), a fully-skipped run stays a LOUD +# skip + exit 0 for dev convenience. +# +# This zero-validated→RED decision is the load-bearing logic. 
It is factored +# into evaluate_require_live_gate() (a pure function of $FAIL/$VALIDATED/ +# $E2E_REQUIRE_LIVE, defined before any platform I/O) and is REGRESSION-GATED +# on every PR by tests/e2e/test_require_live_priority_gate_unit.sh, which +# sources this file (E2E_PRIORITY_UNIT_SOURCE=1), sets the counters, and +# asserts the gate's exit code — no platform, no provisioning, no network. +# So the false-green can't silently come back: a revert of the guard fails CI. +# +# CI POSTURE (REQUIRE-LIVE ON — see .gitea/workflows/e2e-api.yml): +# The live e2e-api job SETS E2E_REQUIRE_LIVE=1. The `mock` arm is the +# CI-provisionable live-completion arm: it org-imports a mock workspace +# (→online→canned A2A reply) with NO external secret. The only thing that +# previously blocked it in CI was admin auth — POST /org/import and POST +# /admin/workspaces/:id/tokens are AdminAuth-gated, and the job set no admin +# token, so every admin call 401'd ("admin auth required"). The job now sets +# ADMIN_TOKEN on the platform AND exports the matching MOLECULE_ADMIN_TOKEN +# the scripts send, so mock validates end-to-end and VALIDATED>=1 holds on a +# healthy platform — the REQUIRED `E2E API Smoke Test` gate now HONESTLY +# validates a runtime. If the mock plumbing or the admin-auth wiring breaks, +# the gate goes RED (not false-green). The zero-validated→RED decision is also +# regression-gated WITHOUT provisioning by the bash unit test above, so a +# revert of that logic still fails CI. +# +# LIVE ARMS (run when their prerequisite is present; opportunistic): +# - `mock` (run_mock) is the no-key REQUIRE-LIVE backbone: a virtual +# workspace (no container, no EC2, no provider) whose org-import path +# short-circuits to status='online' with a canned A2A reply. It validates +# in CI now that the e2e-api job wires an admin token (org-import + token +# mint are AdminAuth-gated), so it is the guaranteed >=1 validation. 
+# - MiniMax (E2E_MINIMAX_API_KEY, from MOLECULE_STAGING_MINIMAX_API_KEY) is +# an OPPORTUNISTIC best-effort real-LLM arm: registry-fragile in CI (422 +# UNREGISTERED_MODEL_FOR_RUNTIME — see run_minimax header), so a miss is +# a best-effort MISS via bestfail() and does NOT red the gate. +# The CI e2e-api job sets E2E_REQUIRE_LIVE=1: mock guarantees a validation, so +# the REQUIRED gate is honest (RED if the mock plumbing/admin-auth breaks). The +# zero-validated→RED logic is also regression-gated by the bash unit test above. +# # Usage: +# # Enforce REQUIRE-LIVE locally (need >=1 arm to actually validate): +# E2E_REQUIRE_LIVE=1 E2E_MINIMAX_API_KEY=... \ +# tests/e2e/test_priority_runtimes_e2e.sh +# +# # Default (no enforcement): all-skip stays a LOUD skip + exit 0: +# tests/e2e/test_priority_runtimes_e2e.sh +# +# # Other live arms (if their secrets are configured): # CLAUDE_CODE_OAUTH_TOKEN=... E2E_OPENAI_API_KEY=... \ # tests/e2e/test_priority_runtimes_e2e.sh # # # Run only one runtime +# E2E_RUNTIMES=mock tests/e2e/test_priority_runtimes_e2e.sh +# E2E_RUNTIMES=minimax tests/e2e/test_priority_runtimes_e2e.sh # E2E_RUNTIMES=claude-code tests/e2e/test_priority_runtimes_e2e.sh # E2E_RUNTIMES=hermes tests/e2e/test_priority_runtimes_e2e.sh # @@ -41,13 +103,81 @@ set -euo pipefail -source "$(dirname "$0")/_lib.sh" - PASS=0 FAIL=0 SKIP=0 +# VALIDATED counts runtimes that ACTUALLY ran end-to-end (provisioned, +# reached online, AND returned a non-error A2A reply). Distinct from PASS, +# which also counts sub-assertions like activity-log rows. This is the +# signal the REQUIRE-LIVE gate keys off: VALIDATED==0 means we proved +# nothing about any runtime, regardless of how many sub-asserts "passed". +VALIDATED=0 CREATED_WSIDS=() +# evaluate_require_live_gate — the SINGLE source of the final exit decision. +# Pure function of $FAIL, $VALIDATED, and $E2E_REQUIRE_LIVE; performs NO I/O +# beyond the loud messages. 
# evaluate_require_live_gate - the single source of the script's final exit
# decision.
#
# Reads three globals and performs no I/O other than printing messages:
#   FAIL              number of failed assertions
#   VALIDATED         number of runtimes validated end-to-end
#   E2E_REQUIRE_LIVE  "1" or "true" enables enforcement; anything else
#                     (including unset) does not
#
# Returns the exit code the script should exit with:
#   FAIL or VALIDATED not a non-negative integer -> 1 (fail closed)
#   FAIL > 0                                     -> 1 (a real failure is red)
#   VALIDATED >= 1                               -> 0 (an arm validated)
#   VALIDATED == 0 and enforcement on            -> 1 (false-green trap -> RED)
#   VALIDATED == 0 and enforcement off           -> 0 (loud dev-convenience skip)
#
# It is a function rather than inline tail code so a unit test can set the
# counters, call it, and assert the return code without any platform.
evaluate_require_live_gate() {
  # Fail closed on corrupted state. Without this check a non-numeric counter
  # makes the `[ ... -ne/-eq ... ]` tests error out, which `if` treats as
  # false, and the function would fall through to the green path.
  local gate_counter
  for gate_counter in "${FAIL-}" "${VALIDATED-}"; do
    case "$gate_counter" in
      '' | *[!0-9]*)
        echo "::error::evaluate_require_live_gate: FAIL and VALIDATED must be non-negative integers (FAIL='${FAIL-}', VALIDATED='${VALIDATED-}'). Failing RED." >&2
        return 1
        ;;
    esac
  done

  # Any real failure is always red.
  if [ "$FAIL" -ne 0 ]; then
    return 1
  fi

  if [ "$VALIDATED" -gt 0 ]; then
    echo "OK: $VALIDATED runtime(s) validated end-to-end."
    return 0
  fi

  # From here on nothing was validated. In enforced mode that must be red so
  # the required gate cannot be green while proving nothing.
  case "${E2E_REQUIRE_LIVE:-0}" in
    1 | true)
      # The mock arm needs no secret, so "no live secret" is not the only
      # way to get here; point at the mock/admin-token wiring first.
      printf '%s\n' \
        "::error::E2E_REQUIRE_LIVE is set but ZERO runtimes were validated end-to-end." \
        " No arm provisioned, came online, and returned a non-error A2A reply, so" \
        " this gate validated nothing. The no-key mock arm is expected to validate" \
        " on a healthy platform: check that E2E_RUNTIMES includes it and that the" \
        " admin bearer (MOLECULE_ADMIN_TOKEN) matches the platform's ADMIN_TOKEN." \
        " Live arms also count when their secrets are wired" \
        " (E2E_MINIMAX_API_KEY ← MOLECULE_STAGING_MINIMAX_API_KEY," \
        " CLAUDE_CODE_OAUTH_TOKEN / E2E_OPENAI_API_KEY). Failing RED instead of" \
        " false-green." >&2
      return 1
      ;;
  esac

  # Dev convenience: no enforcement requested -> loud skip, exit 0.
  echo "SKIPPED: no live secrets present and E2E_REQUIRE_LIVE is not set — validated" >&2
  echo " zero runtimes. This is a dev-convenience pass; CI sets" >&2
  echo " E2E_REQUIRE_LIVE=1 to make zero-validated a hard failure." >&2
  return 0
}
# bestfail LABEL DETAIL - report a miss from an opportunistic (best-effort)
# arm. It prints the label and the detail, then bumps SKIP. FAIL is left
# untouched, so a best-effort miss is visible in the tally but never turns the
# gate red.
bestfail() {
  printf '%s\n' " BEST-EFFORT MISS — $1"
  printf '%s\n' " $2"
  SKIP=$(( SKIP + 1 ))
}
- resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \ + resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \ -d "{\"name\":\"Priority E2E (claude-code)\",\"runtime\":\"claude-code\",\"model\":\"sonnet\",\"tier\":1,\"secrets\":$secrets}") wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true if [ -z "$wsid" ]; then @@ -220,9 +362,9 @@ print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN local reply if reply=$(send_test_prompt "$wsid" "$token"); then if echo "$reply" | grep -q "PONG"; then - pass "claude-code reply contains PONG" + validated "claude-code reply contains PONG" else - pass "claude-code reply non-empty (first 80 chars: ${reply:0:80})" + validated "claude-code reply non-empty (first 80 chars: ${reply:0:80})" fi assert_activity_logged "claude-code" "$wsid" "$token" else @@ -254,7 +396,7 @@ print(json.dumps({ })) ") local resp wsid - resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \ + resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \ -d "{\"name\":\"Priority E2E (hermes)\",\"runtime\":\"hermes\",\"tier\":1,\"model\":\"openai/gpt-4o\",\"secrets\":$secrets}") wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true if [ -z "$wsid" ]; then @@ -288,9 +430,9 @@ print(json.dumps({ local reply if reply=$(send_test_prompt "$wsid" "$token"); then if echo "$reply" | grep -q "PONG"; then - pass "hermes reply contains PONG" + validated "hermes reply contains PONG" else - pass "hermes reply non-empty (first 80 chars: ${reply:0:80})" + validated "hermes reply non-empty (first 80 chars: ${reply:0:80})" fi assert_activity_logged "hermes" "$wsid" "$token" else @@ -327,7 +469,7 @@ print(json.dumps({ })) ") local resp wsid - resp=$(curl -s -X POST "$BASE/workspaces" -H 
"Content-Type: application/json" \ + resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \ -d "{\"name\":\"Priority E2E ($runtime)\",\"runtime\":\"$runtime\",\"tier\":1,\"model\":\"openai/gpt-4o-mini\",\"secrets\":$secrets}") wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true if [ -z "$wsid" ]; then @@ -358,9 +500,9 @@ print(json.dumps({ local reply if reply=$(send_test_prompt "$wsid" "$token"); then if echo "$reply" | grep -q "PONG"; then - pass "$runtime reply contains PONG" + validated "$runtime reply contains PONG" else - pass "$runtime reply non-empty (first 80 chars: ${reply:0:80})" + validated "$runtime reply non-empty (first 80 chars: ${reply:0:80})" fi assert_activity_logged "$runtime" "$wsid" "$token" else @@ -371,18 +513,253 @@ print(json.dumps({ run_codex() { run_openai_runtime "codex" "codex"; } run_openclaw() { run_openai_runtime "openclaw" "openclaw"; } -WANT="${E2E_RUNTIMES:-claude-code codex hermes openclaw}" +#################################################################### +# Mock arm — the GUARANTEED, always-available REQUIRE-LIVE backbone. +#################################################################### +# The mock runtime (workspace-server/internal/handlers/mock_runtime.go) +# is a virtual workspace: NO container, NO EC2, NO LLM key. The org-import +# path (createWorkspaceTree, org_import.go) short-circuits a runtime=mock +# workspace straight to status='online' (no provisioner needed), and the +# A2A proxy (a2a_proxy.go → handleMockA2A) synthesises a deterministic +# canned JSON-RPC reply with logActivity=true (writes the activity_logs +# row too). That makes mock the perfect REQUIRE-LIVE backbone: it +# exercises the SAME plumbing every real runtime needs to pass — +# provision-decision → status=online → A2A round-trip → activity_logs — +# without depending on any external provider key or LLM availability. 
It
+# is GREEN on a healthy platform and RED only if that plumbing genuinely
+# breaks (DB insert, status flip, A2A proxy, activity logging). No more
+# false-green (zero-validated is impossible when mock works), and no more
+# can't-go-green (mock needs no secret, so it always runs in CI).
+#
+# Why org-import (POST /org/import) instead of POST /workspaces:
+# The mock→online short-circuit lives ONLY in createWorkspaceTree
+# (org_import.go). The single-workspace Create handler (workspace.go)
+# has no mock branch — it routes runtime=mock through
+# provisionWorkspaceAuto, which in CI's local-build mode has no mock
+# image and would never reach online. Org-import is the supported path
+# to a live mock workspace, so the arm drives it.
+#
+# The canned reply is one of the "On it!" variants (NOT "PONG"), so this
+# arm validates on the non-empty / non-error branch — that is the real
+# contract for mock (it proves the plumbing, not an LLM's instruction-
+# following).
+run_mock() {
+  echo ""
+  echo "=== mock (no-key plumbing backbone) happy path ==="
+  # No secret gate — mock ALWAYS runs. That is the whole point: it is the
+  # required-validation arm that keeps E2E_REQUIRE_LIVE honest without a key.
+
+  # Inline single-workspace mock org. model is a required field on the
+  # org-import contract (createWorkspaceTree fails-closed without one);
+  # mock never USES the model, so any non-empty value satisfies the
+  # contract. The org-import path does not run the Create handler's
+  # registry model-validation, so "mock" is accepted as-is.
+  # POST /org/import is AdminAuth-gated (router.go:778). When the platform has
+  # ADMIN_TOKEN set (as the e2e-api CI job now does), an unauthenticated import
+  # 401s with {"error":"admin auth required"}. Send the same admin bearer the
+  # mint helper uses (MOLECULE_ADMIN_TOKEN, ADMIN_TOKEN fallback) — guarded so a
+  # bootstrap/dev platform with no admin token (fail-open) still works.
+  local admin_bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
+  local admin_auth=()
+  [ -n "$admin_bearer" ] && admin_auth=(-H "Authorization: Bearer $admin_bearer")
+  local import_resp wsid
+  import_resp=$(curl -s -X POST "$BASE/org/import" -H "Content-Type: application/json" \
+    ${admin_auth[@]+"${admin_auth[@]}"} \
+    -d '{
+      "template": {
+        "name": "Priority E2E Mock Org",
+        "defaults": {"runtime": "mock", "model": "mock", "tier": 1},
+        "workspaces": [
+          {"name": "Priority E2E (mock)", "runtime": "mock", "model": "mock", "tier": 1}
+        ]
+      }
+    }')
+  # org-import returns {"org":..., "count":N, "workspaces":[{"id":...,
+  # "name":...,"tier":...}, ...]} (handlers/org.go:898-901). Pull the id of
+  # the single workspace we declared. (Older "results" key fallback kept for
+  # forward/back compat in case the response shape is ever versioned.)
+  wsid=$(echo "$import_resp" | python3 -c '
+import json, sys
+try:
+    d = json.load(sys.stdin)
+except Exception:
+    sys.exit(0)
+for r in (d.get("workspaces") or d.get("results") or []):
+    if r.get("name") == "Priority E2E (mock)" and r.get("id"):
+        print(r["id"]); break
+') || true
+  if [ -z "$wsid" ]; then
+    # mock org-import is the REQUIRE-LIVE backbone and is EXPECTED to succeed in
+    # CI now that the e2e-api job wires an admin token (ADMIN_TOKEN on the
+    # platform + MOLECULE_ADMIN_TOKEN sent above). A missing id here is a REAL
+    # break (admin-auth wiring, org-import create, or the mock short-circuit) and
+    # MUST red the gate — so this is a hard fail(), not a best-effort miss. Under
+    # E2E_REQUIRE_LIVE=1 a FAIL also forces a non-zero exit via
+    # evaluate_require_live_gate. Surface the response so the break is visible
+    # (e.g. {"error":"admin auth required"} would mean the token wiring regressed).
+    fail "create mock workspace (org-import)" "$import_resp"
+    return 0
+  fi
+  CREATED_WSIDS+=("$wsid")
+  echo " workspace=$wsid"
+
+  # Mock goes straight to online (no container boot) — a short budget is
+  # plenty; if it is NOT online quickly the mock short-circuit in
+  # createWorkspaceTree is genuinely broken and the gate SHOULD red.
+  local final
+  final=$(wait_for_status "$wsid" "online failed" 60) || true
+  if [ "$final" != "online" ]; then
+    fail "mock workspace reaches online" "final status: $final (mock should go online without provisioning)"
+    return 0
+  fi
+  pass "mock workspace reaches online"
+
+  # Mock workspaces are not created with an inline token; mint one via the
+  # admin endpoint (same fallback every other arm uses).
+  local token
+  token=$(e2e_mint_workspace_token "$wsid") || true
+  if [ -z "$token" ]; then
+    fail "resolve mock workspace token" "no token returned from POST /admin/workspaces/:id/tokens"
+    return 0
+  fi
+
+  # A2A round-trip. The mock proxy returns a canned non-error reply (one
+  # of the "On it!" variants) — NOT "PONG" — so we validate on non-error AND
+  # non-empty (checked explicitly: an empty body must never count as validated).
+  # Such a reply means the A2A short-circuit + reply-shape contract are intact.
+  local reply
+  if reply=$(send_test_prompt "$wsid" "$token") && [ -n "$reply" ]; then
+    validated "mock reply non-empty (canned; first 80 chars: ${reply:0:80})"
+    assert_activity_logged "mock" "$wsid" "$token"
+  else
+    fail "mock reply" "${reply:-} (mock A2A short-circuit should always return a canned reply)"
+  fi
+}
+
+####################################################################
+# MiniMax live arm — OPPORTUNISTIC (best-effort) real-LLM arm.
+####################################################################
+# NOTE: this is now a BEST-EFFORT arm, not the REQUIRE-LIVE backbone.
+# mock (run_mock above) is the guaranteed, no-key validation that keeps
+# the gate honest.
MiniMax-create is fragile in CI: the namespaced model +# id minimax:MiniMax-M2.7 is NOT in claude-code's native model set and +# does NOT resolve via DeriveProvider (its only prefix-owner, byok-minimax, +# is not wired as a claude-code runtime arm), so the create is rejected +# 422 UNREGISTERED_MODEL_FOR_RUNTIME before any provisioning (RCA core +# registry_gen.go Runtimes["claude-code"]). Rather than red the REQUIRED +# gate on that registry-skew (or on any transient MiniMax provisioning / +# model-registration issue), this arm reports a best-effort MISS via +# bestfail() and lets mock carry the validation. If MiniMax DOES come up +# it validates as a bonus real-LLM check. +# Drives the claude-code runtime against MiniMax (BYOK) using the +# already-present Gitea secret MOLECULE_STAGING_MINIMAX_API_KEY, +# surfaced into the env as E2E_MINIMAX_API_KEY (same name + secret the +# staging-smoke / continuous-synth canaries use — see staging-smoke.yml +# and continuous-synth-e2e.yml). NO new credential is introduced. +# +# How this arm is wired (historical rationale; best-effort now, see NOTE): +# - claude-code's `minimax` provider (providers.yaml / registry_gen.go) +# is third_party_anthropic_compat: it reads MINIMAX_API_KEY at boot +# and routes ANTHROPIC_BASE_URL → api.minimax.io/anthropic. So the +# ONLY tenant secret needed is {"MINIMAX_API_KEY": "<key>"} — exactly +# the SECRETS_JSON branch test_staging_full_saas.sh uses. +# - Model id is the NAMESPACED colon-form `minimax:MiniMax-M2.7`, picked +# as the BYOK arm for claude-code (registry_gen.go Runtimes +# ["claude-code"]["minimax"]). Per core#2263 the BARE `MiniMax-M2` +# id can 400 on a registry-skewed ws-server build; the namespaced +# form was expected to resolve the way kimi's `moonshot/…` does, but the +# RCA above reports it rejected 422 — NOTE(review): confirm which holds.
+run_minimax() {
+  echo ""
+  echo "=== minimax (claude-code BYOK) happy path ==="
+  if [ -z "${E2E_MINIMAX_API_KEY:-}" ]; then
+    skip "E2E_MINIMAX_API_KEY not set (MiniMax live arm needs the MiniMax key)"
+    return 0
+  fi
+  local secrets
+  secrets=$(python3 -c "
+import json, os
+# claude-code's minimax provider (third_party_anthropic_compat) reads
+# MINIMAX_API_KEY and points ANTHROPIC_BASE_URL at api.minimax.io/anthropic
+# at boot — so the ONLY tenant secret needed is the MiniMax key itself.
+print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))
+") || true  # best-effort arm: if errexit is active a python failure must not abort; the create below then misses
+  local resp wsid
+  # Namespaced BYOK model id (core#2263): bare MiniMax-M2 can 400 on a
+  # registry-skewed ws-server build. The namespaced id may itself be rejected
+  # 422 (see the arm header) — that is handled below as a best-effort miss.
+  resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
+    -d "{\"name\":\"Priority E2E (minimax)\",\"runtime\":\"claude-code\",\"model\":\"minimax:MiniMax-M2.7\",\"tier\":1,\"secrets\":$secrets}")
+  wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
+  if [ -z "$wsid" ]; then
+    # BEST-EFFORT: MiniMax-create is fragile (see header — the namespaced
+    # model id is registry-skewed → 422). Do NOT red the gate; mock is the
+    # required backbone. Report the create response so the skew is visible.
+    bestfail "create minimax workspace (best-effort; mock carries the gate)" "$resp"
+    return 0
+  fi
+  CREATED_WSIDS+=("$wsid")
+  echo " workspace=$wsid"
+
+  # claude-code runtime image is already pulled; cold boot ~30-90s. The
+  # first MiniMax cold-call can be slow but that's covered by send_test_prompt's
+  # --max-time 180.
+  local final
+  final=$(wait_for_status "$wsid" "online failed" 240) || true
+  if [ "$final" != "online" ]; then
+    bestfail "minimax workspace reaches online (best-effort)" "final status: $final"
+    return 0
+  fi
+  pass "minimax workspace reaches online"
+
+  local token
+  token=$(echo "$resp" | e2e_extract_token) || true  # a helper/pipefail failure must fall through to bestfail, not abort
+  if [ -z "$token" ]; then
+    token=$(e2e_mint_workspace_token "$wsid") || true  # same guard run_mock puts on its mint
+  fi
+  if [ -z "$token" ]; then
+    bestfail "resolve minimax workspace token (best-effort)" "no token returned"
+    return 0
+  fi
+
+  local reply
+  if reply=$(send_test_prompt "$wsid" "$token"); then
+    if echo "$reply" | grep -q "PONG"; then
+      validated "minimax reply contains PONG"
+    else
+      validated "minimax reply non-empty (first 80 chars: ${reply:0:80})"
+    fi
+    assert_activity_logged "minimax" "$wsid" "$token"
+  else
+    bestfail "minimax reply (best-effort)" "${reply:-}"
+  fi
+}
+
+# `mock` runs FIRST and by default: it is the no-key REQUIRE-LIVE backbone
+# that guarantees >=1 validation on a healthy platform (see run_mock). The
+# real-LLM arms (claude-code/codex/hermes/openclaw/minimax) run if their
+# secrets are present and add real-provider coverage on top; minimax is
+# best-effort (never reds the gate).
+WANT="${E2E_RUNTIMES:-mock claude-code codex hermes openclaw minimax}" for r in $WANT; do case "$r" in + mock) run_mock ;; claude-code) run_claude_code ;; codex) run_codex ;; hermes) run_hermes ;; openclaw) run_openclaw ;; - all) run_claude_code; run_codex; run_hermes; run_openclaw ;; + minimax) run_minimax ;; + all) run_mock; run_claude_code; run_codex; run_hermes; run_openclaw; run_minimax ;; *) echo "unknown runtime in E2E_RUNTIMES: $r" >&2; exit 2 ;; esac done echo "" -echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ===" -[ "$FAIL" -eq 0 ] +echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped, $VALIDATED runtime(s) validated end-to-end ===" + +# Final exit decision lives in evaluate_require_live_gate (defined at the top of +# this file, before any platform I/O) so the same logic is unit-tested in +# isolation by test_require_live_priority_gate_unit.sh. Mirror its return code +# into the process exit code. +evaluate_require_live_gate +exit $? diff --git a/tests/e2e/test_require_live_priority_gate_unit.sh b/tests/e2e/test_require_live_priority_gate_unit.sh new file mode 100755 index 000000000..8439d9abd --- /dev/null +++ b/tests/e2e/test_require_live_priority_gate_unit.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE zero-validated +# gate in test_priority_runtimes_e2e.sh (the REQUIRED `E2E API Smoke Test`). +# +# WHY (harden/enforce-ci-gates-core-v2, PR #2286): the priority-runtimes E2E's +# only historical exit gate was `[ "$FAIL" -eq 0 ]`. When every runtime SKIPs +# because no live secret is present — exactly what the CI step did — PASS=0 +# FAIL=0 and the script exited 0 (GREEN) while validating ZERO runtimes. The +# REQUIRED merge gate was therefore false-green: passing without exercising a +# single runtime. The fix adds a VALIDATED counter and makes a zero-validated +# run RED when E2E_REQUIRE_LIVE is set. 
+#
+# That zero-validated→RED decision lives in evaluate_require_live_gate() in
+# test_priority_runtimes_e2e.sh. The live e2e-api job now sets
+# E2E_REQUIRE_LIVE=1 (its mock arm can provision once the admin token is
+# wired; MiniMax may 422 and claude-code needs a key CI lacks), but a healthy
+# live run only exercises the GREEN direction — it cannot show that a run
+# validating nothing goes RED. This UNIT test covers that: it drives the
+# REAL evaluate_require_live_gate() function — not a copy — in isolation by
+# sourcing the script with E2E_PRIORITY_UNIT_SOURCE=1 (which stops before any
+# platform I/O), setting the counters, and asserting the gate's return code.
+#
+# Because it exercises the actual function, a future revert of the zero-
+# validated→RED logic in test_priority_runtimes_e2e.sh fails THIS test on
+# every PR — so the false-green can't silently come back.
+#
+# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
+# logic — so it runs on every PR in the fast lane and locally via `bash`.
+set -uo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+GATE_SCRIPT="$SCRIPT_DIR/test_priority_runtimes_e2e.sh"
+
+if [ ! -f "$GATE_SCRIPT" ]; then
+  echo "FATAL: cannot find $GATE_SCRIPT" >&2
+  exit 2
+fi
+
+PASS=0
+FAIL=0
+
+# run_case REQUIRE_LIVE VALIDATED FAILCOUNT
+# Sources the REAL test_priority_runtimes_e2e.sh under the unit source-guard
+# (E2E_PRIORITY_UNIT_SOURCE=1 → it returns right after defining the counters
+# and evaluate_require_live_gate(), before _lib.sh / the live pre-sweep curl),
+# sets the counters to the scenario, calls the real gate, and echoes the
+# return code. Each case runs in a fresh `bash -c` so set -e/-u inside the
+# sourced script can't leak between cases or kill this harness.
+run_case() {
+  local require_live="$1" validated="$2" failcount="$3"
+  local observed
+  E2E_PRIORITY_UNIT_SOURCE=1 \
+  E2E_REQUIRE_LIVE="$require_live" \
+  GATE_SCRIPT="$GATE_SCRIPT" \
+  VAL="$validated" \
+  FL="$failcount" \
+  bash -c '
+    set -uo pipefail
+    # shellcheck disable=SC1090
+    source "$GATE_SCRIPT" # returns at the source-guard (no platform I/O)
+    VALIDATED="$VAL"
+    FAIL="$FL"
+    evaluate_require_live_gate >/dev/null 2>&1
+    exit $?
+  '
+  observed=$?  # exit status of the bash -c child, i.e. the gate's return code
+  echo "$observed"
+}
+
+assert_rc() {  # assert_rc LABEL REQUIRE_LIVE VALIDATED FAILCOUNT EXPECTED_RC — tallies into PASS/FAIL
+  local label="$1" require_live="$2" validated="$3" failcount="$4" expected="$5"
+  local observed
+  observed=$(run_case "$require_live" "$validated" "$failcount")
+  if [ "$observed" = "$expected" ]; then
+    echo " ✓ $label: REQUIRE_LIVE=$require_live VALIDATED=$validated FAIL=$failcount → rc=$observed"
+    PASS=$((PASS + 1))
+  else
+    echo " ✗ $label: REQUIRE_LIVE=$require_live VALIDATED=$validated FAIL=$failcount expected=$expected OBSERVED=$observed" >&2
+    FAIL=$((FAIL + 1))
+  fi
+}
+
+echo "=== E2E_REQUIRE_LIVE priority-runtimes zero-validated gate proof ==="
+echo " (drives the REAL evaluate_require_live_gate from $GATE_SCRIPT)"
+echo
+
+# (a) DECISIVE false-green trap: REQUIRE_LIVE=1 + zero validated → RED (exit 1).
+assert_rc "require-live, zero validated → RED (the false-green trap)" \
+  1 0 0 1
+
+# (b) REQUIRE_LIVE=1 + at least one validated → GREEN (exit 0).
+assert_rc "require-live, one validated → GREEN" \
+  1 1 0 0
+assert_rc "require-live, several validated → GREEN" \
+  1 3 0 0
+
+# (c) REQUIRE_LIVE=0 (stands in for unset; run_case always exports the var) + zero validated → GREEN (loud skip).
+assert_rc "no require-live, zero validated → GREEN (dev-convenience loud skip)" \
+  0 0 0 0
+
+# REQUIRE_LIVE=true (string form) is also honoured by the gate.
+assert_rc "require-live='true', zero validated → RED" \
+  true 0 0 1
+
+# A real FAIL is always red, regardless of REQUIRE_LIVE / VALIDATED — the
+# zero-validated guard must not mask (nor be masked by) a genuine failure.
+assert_rc "real FAIL with validations, no require-live → RED" \
+  0 2 1 1
+assert_rc "real FAIL, zero validated, no require-live → RED" \
+  0 0 1 1
+
+echo
+echo "=== Results: $PASS passed, $FAIL failed ==="
+[ "$FAIL" -eq 0 ]