harden(ci): E2E API Smoke fails on zero-validated + wires existing MiniMax live arm #2286

Merged
core-devops merged 6 commits from harden/enforce-ci-gates-core-v2 into main 2026-06-05 07:59:31 +00:00
7 changed files with 684 additions and 66 deletions
+11
View File
@@ -372,6 +372,17 @@ jobs:
# staging gate report green without a real provision→online→A2A
# cycle goes red on every PR.
bash tests/e2e/test_require_live_guard_unit.sh
# harden/enforce-ci-gates-core-v2 (PR #2286): fail-direction proof
# for the E2E_REQUIRE_LIVE zero-validated gate in
# test_priority_runtimes_e2e.sh (the REQUIRED `E2E API Smoke Test`).
# Offline (no LLM/network/provisioning): sources that script under
# its unit source-guard and drives the REAL evaluate_require_live_gate
# — asserts REQUIRE_LIVE=1 + zero validated → RED (the false-green
# trap), REQUIRE_LIVE=1 + >=1 validated → GREEN, and REQUIRE_LIVE
# unset + zero validated → GREEN (loud skip). CI can't provision a
# live arm to prove this, so this unit test IS the regression gate:
# a revert of the zero-validated→RED logic goes red on every PR.
bash tests/e2e/test_require_live_priority_gate_unit.sh
- if: ${{ needs.changes.outputs.scripts == 'true' }}
name: Test ECR promote-tenant-image script (mock-driven, no live infra)
+44 -1
View File
@@ -272,6 +272,24 @@ jobs:
echo "::error::Redis did not become ready in 15s"
docker logs "$REDIS_CONTAINER" || true
exit 1
- name: Set deterministic admin token for the e2e platform
if: needs.detect-changes.outputs.api == 'true'
run: |
# AdminAuth (workspace-server/internal/middleware/wsauth_middleware.go:164)
# reads ADMIN_TOKEN. Setting it (a) closes isDevModeFailOpen (devmode.go:50
# returns false when ADMIN_TOKEN is non-empty), so admin routes require a
# bearer, and (b) makes Tier-2b accept a bearer that constant-time-equals
# ADMIN_TOKEN. The platform process inherits ADMIN_TOKEN from $GITHUB_ENV.
#
# MOLECULE_ADMIN_TOKEN is the var the e2e scripts send as the bearer
# (tests/e2e/_lib.sh:33 e2e_mint_workspace_token, and the run_mock
# org-import curl). Set BOTH to the SAME value so the bearer the test
# sends == the secret the platform checks. Deterministic test value;
# this platform is ephemeral, single-run, and never reachable off-host.
E2E_ADMIN_TOKEN="e2e-api-admin-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "MOLECULE_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "Admin token configured for the e2e platform (ADMIN_TOKEN + MOLECULE_ADMIN_TOKEN)."
- name: Build platform
if: needs.detect-changes.outputs.api == 'true'
working-directory: workspace-server
@@ -397,8 +415,33 @@ jobs:
- name: Run notify-with-attachments E2E
if: needs.detect-changes.outputs.api == 'true'
run: bash tests/e2e/test_notify_attachments_e2e.sh
- name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
- name: "Run priority-runtimes E2E (REQUIRE-LIVE: mock validates the runtime plumbing end-to-end)"
# E2E_REQUIRE_LIVE=1 is ON: the run MUST validate >=1 runtime end-to-end
# or it exits NON-zero (RED). This is now SAFE because the `mock` arm can
# actually provision in CI: the only blocker was that POST /org/import and
# POST /admin/workspaces/:id/tokens are AdminAuth-gated
# (router.go:778 + :427) and this job previously configured NO admin token,
# so every admin call 401'd ("admin auth required"). The "Set deterministic
# admin token" step above now sets ADMIN_TOKEN on the platform AND exports
# the matching MOLECULE_ADMIN_TOKEN the e2e scripts send as the bearer, so
# the mock arm can org-import → online → mint token → canned A2A reply →
# validated(). That guarantees VALIDATED>=1 on a healthy platform, so the
# REQUIRED `E2E API Smoke Test` gate now HONESTLY validates a runtime
# end-to-end; if the mock plumbing (DB insert, status flip, A2A proxy,
# activity logging, or the admin-auth wiring) genuinely breaks, the gate
# goes RED instead of false-green. The zero-validated→RED decision is also
# regression-gated WITHOUT provisioning by the bash unit test
# tests/e2e/test_require_live_priority_gate_unit.sh (wired into ci.yml's
# "Run E2E bash unit tests" job), so a revert of that logic still fails CI.
#
# MiniMax stays an OPPORTUNISTIC best-effort arm: create is registry-fragile
# in CI (422 UNREGISTERED_MODEL_FOR_RUNTIME), so a miss is reported via
# bestfail() and never reds the gate — mock carries the required validation,
# MiniMax is a bonus real-LLM check when it comes up. ZERO new credentials.
if: needs.detect-changes.outputs.api == 'true'
env:
E2E_REQUIRE_LIVE: '1'
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
run: bash tests/e2e/test_priority_runtimes_e2e.sh
- name: Install standalone runtime parser from Gitea registry
if: needs.detect-changes.outputs.api == 'true'
+49 -3
View File
@@ -17,6 +17,33 @@ e2e_extract_token() {
python3 "$(dirname "${BASH_SOURCE[0]}")/_extract_token.py"
}
# Populate a curl-args array with the platform admin bearer, IF one is set.
#
# AdminAuth (workspace-server/internal/middleware/wsauth_middleware.go:161)
# fail-opens ONLY while ADMIN_TOKEN is unset AND no workspace token exists yet
# (devmode.go:50). The e2e-api CI job now sets ADMIN_TOKEN on the platform and
# exports the matching MOLECULE_ADMIN_TOKEN here, which flips fail-open OFF — so
# every admin-gated route (GET/POST/DELETE /workspaces, /events, /bundles,
# /org/import, …) now requires the EXACT ADMIN_TOKEN as bearer (Tier-2b rejects
# workspace bearers, wsauth_middleware.go:250). Helpers that hit admin routes
# (e2e_cleanup_all_workspaces, e2e_delete_workspace's default path) must send it.
#
# Guarded if-set so a bootstrap/dev platform with no admin token (fail-open)
# still works with zero auth. Mirrors e2e_mint_workspace_token's admin_auth.
#
# Usage:
# local admin_auth=(); e2e_admin_auth_args admin_auth
# curl -s "$BASE/workspaces" ${admin_auth[@]+"${admin_auth[@]}"}
# e2e_admin_auth_args NAME
#
# Fill the caller's array variable NAME with the curl arguments that carry the
# platform admin bearer: (-H "Authorization: Bearer <token>") when
# MOLECULE_ADMIN_TOKEN (preferred) or ADMIN_TOKEN is non-empty, or an empty
# array when neither is set.
#
# Arguments: $1 - name of the array variable to populate (a plain identifier)
# Returns:   0 on success; 2 (with a message on stderr) when $1 is not a
#            usable variable name.
#
# The assignment goes through `eval` (no nameref, so it keeps working on old
# bash), which means NAME is executed as shell text. It is therefore validated
# first: anything that is not a bare identifier is rejected instead of being
# evaluated. `_outname` and `_bearer` are rejected too, because this function's
# own locals of those names would shadow the caller's variable and the result
# would be lost silently.
e2e_admin_auth_args() {
  local _outname="$1"
  local _bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"

  case "$_outname" in
    '' | [!A-Za-z_]* | *[!A-Za-z0-9_]* | _outname | _bearer)
      echo "e2e_admin_auth_args: invalid array name: '$_outname'" >&2
      return 2
      ;;
  esac

  if [ -n "$_bearer" ]; then
    # \$_bearer is expanded inside the eval'd assignment, so the token value
    # itself is never re-parsed as shell syntax.
    eval "$_outname=(-H \"Authorization: Bearer \$_bearer\")"
  else
    eval "$_outname=()"
  fi
}
# Delete every workspace currently on the platform. Use at the top of a
# script so count-based assertions are reproducible across runs.
# Mint a fresh workspace auth token via the real admin endpoint.
@@ -53,19 +80,38 @@ e2e_delete_workspace() {
if [ -z "$wid" ]; then
return 0
fi
# DELETE /workspaces/:id is behind AdminAuth (List/Delete are gated at
# router.go:165-167); the single-workspace GET used below for the name lookup
# is public (router.go:155). Callers that already pass a per-workspace bearer (e.g.
# test_api.sh's NEW_TOKEN) authenticate themselves; the cleanup-trap callers
# in poll-mode/notify/priority pass NO curl args and rely on this fallback to
# the platform admin bearer so the DELETE doesn't 401 once ADMIN_TOKEN is set.
if [ "${#curl_args[@]}" -eq 0 ]; then
e2e_admin_auth_args curl_args
fi
# ${curl_args[@]+"…"} guard: under `set -u` an empty array expands to an
# "unbound variable" error on bash <4.4 (macOS 3.2, some Linux). This form
# expands to nothing when the array is empty. Callers from the priority-
# runtimes EXIT trap pass no extra curl args, so the array IS empty there —
# without the guard the trap aborts non-zero AFTER the gate already passed,
# turning a validated run RED. (Same idiom already used for CREATED_WSIDS.)
if [ -z "$name" ]; then
name=$(curl -s "$BASE/workspaces/$wid" "${curl_args[@]}" | python3 -c "import json,sys
name=$(curl -s "$BASE/workspaces/$wid" ${curl_args[@]+"${curl_args[@]}"} | python3 -c "import json,sys
try:
print(json.load(sys.stdin).get('name',''))
except Exception:
pass" 2>/dev/null || true)
fi
curl -s -X DELETE "$BASE/workspaces/$wid?confirm=true" \
-H "X-Confirm-Name: $name" "${curl_args[@]}" > /dev/null || true
-H "X-Confirm-Name: $name" ${curl_args[@]+"${curl_args[@]}"} > /dev/null || true
}
e2e_cleanup_all_workspaces() {
curl -s "$BASE/workspaces" | python3 -c "import json,sys
# GET /workspaces (list) is AdminAuth-gated (router.go:165). Send the platform
# admin bearer if one is set so the list doesn't 401 → empty → no cleanup.
local _admin_auth=()
e2e_admin_auth_args _admin_auth
curl -s "$BASE/workspaces" ${_admin_auth[@]+"${_admin_auth[@]}"} | python3 -c "import json,sys
try:
[print(f\"{w.get('id','')}\\t{w.get('name','')}\") for w in json.load(sys.stdin)]
except Exception:
+61 -41
View File
@@ -15,18 +15,27 @@ SUM_AUTH=()
ECHO_URL="https://example.com/echo-agent"
SUM_URL="https://example.com/summarizer-agent"
# AdminAuth-gated calls need a bearer token once any workspace token
# exists in the DB. ADMIN_TOKEN is populated after the first workspace
# create + real token mint. acurl = "authenticated curl".
ADMIN_TOKEN=""
# AdminAuth-gated calls (GET/POST/DELETE /workspaces, /events, /bundles)
# require the platform admin bearer once ADMIN_TOKEN is set on the server.
# Tier-2b (wsauth_middleware.go:250) REJECTS workspace bearer tokens on admin
# routes when ADMIN_TOKEN is set, so admin calls MUST send the exact ADMIN_TOKEN
# value — which the e2e-api CI job exports here as MOLECULE_ADMIN_TOKEN. acurl =
# "admin curl": it always sends the platform admin bearer (if one is set).
#
# Guarded if-set: a fresh self-hosted/dev platform with no ADMIN_TOKEN fail-opens
# (devmode.go:50), so sending no bearer still works there.
ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
ADMIN_AUTH=()
[ -n "$ADMIN_BEARER" ] && ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
acurl() {
if [ -n "$ADMIN_TOKEN" ]; then
curl -s -H "Authorization: Bearer $ADMIN_TOKEN" "$@"
else
curl -s "$@"
fi
curl -s ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} "$@"
}
# WORKSPACE_TOKEN holds a per-workspace bearer for the WorkspaceAuth-gated
# routes (PATCH /workspaces/:id, /activity, …). It is set after the first
# create+mint and is NOT interchangeable with the admin bearer.
WORKSPACE_TOKEN=""
# Pre-test cleanup: remove any workspaces left over from prior runs so
# count-based assertions ("empty", "count=2") are reproducible.
e2e_cleanup_all_workspaces
@@ -57,19 +66,22 @@ check "GET /health" '"status":"ok"' "$R"
R=$(acurl "$BASE/workspaces")
check "GET /workspaces (empty)" '[]' "$R"
# Test 3: Create workspace A (AdminAuth fail-open — no tokens exist yet)
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" -d '{"name":"Echo Agent","tier":1,"runtime":"external","external":true}')
# Test 3: Create workspace A. POST /workspaces is AdminAuth-gated (router.go:166);
# send the admin bearer (acurl). On a fail-open dev platform acurl sends nothing
# and the create still works.
R=$(acurl -X POST "$BASE/workspaces" -H "Content-Type: application/json" -d '{"name":"Echo Agent","tier":1,"runtime":"external","external":true}')
check "POST /workspaces (create echo)" '"status":"awaiting_agent"' "$R"
ECHO_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
ADMIN_TOKEN=$(echo "$R" | e2e_extract_token)
if [ -z "$ADMIN_TOKEN" ]; then
ADMIN_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "")
# Per-workspace token for Echo, for the WorkspaceAuth-gated routes below.
WORKSPACE_TOKEN=$(echo "$R" | e2e_extract_token)
if [ -z "$WORKSPACE_TOKEN" ]; then
WORKSPACE_TOKEN=$(e2e_mint_workspace_token "$ECHO_ID" 2>/dev/null || echo "")
fi
if [ -n "$ADMIN_TOKEN" ]; then
echo " (acquired admin token: ${ADMIN_TOKEN:0:8}...)"
if [ -n "$WORKSPACE_TOKEN" ]; then
echo " (acquired Echo workspace token: ${WORKSPACE_TOKEN:0:8}...)"
else
echo " WARNING: no admin token acquired — subsequent AdminAuth calls will fail"
echo " WARNING: no Echo workspace token acquired — WorkspaceAuth calls will fail"
fi
# Test 4: Create workspace B (needs bearer — tokens now exist in DB)
@@ -98,7 +110,7 @@ check "GET /workspaces/:id (agent_card null)" '"agent_card":null' "$R"
# Test 7: Register echo — use workspace-specific token (from real admin
# endpoint), not the admin token. C18 requires a token issued TO THIS
# workspace, not just any valid token.
ECHO_WS_TOKEN="$ADMIN_TOKEN"
ECHO_WS_TOKEN="$WORKSPACE_TOKEN"
[ -n "$ECHO_WS_TOKEN" ] && ECHO_AUTH=(-H "Authorization: Bearer $ECHO_WS_TOKEN")
R=$(curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" \
"${ECHO_AUTH[@]}" \
@@ -159,26 +171,29 @@ R=$(curl -s -X POST "$BASE/registry/check-access" -H "Content-Type: application/
-d "{\"caller_id\":\"$ECHO_ID\",\"target_id\":\"$SUM_ID\"}")
check "POST /registry/check-access (same-org allowed)" '"allowed":true' "$R"
# Test 15: PATCH workspace (update position)
R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}')
# Test 15: PATCH workspace (update position). PATCH /workspaces/:id is
# WorkspaceAuth-gated (router.go:227 — #680 IDOR fix), so it needs Echo's OWN
# bearer, NOT the admin bearer (WorkspaceAuth rejects the admin token).
R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"x":100,"y":200}')
check "PATCH /workspaces/:id (position)" '"status":"updated"' "$R"
R=$(acurl "$BASE/workspaces/$ECHO_ID")
check "Position saved (x=100)" '"x":100' "$R"
check "Position saved (y=200)" '"y":200' "$R"
# Test 16: PATCH workspace (update name)
R=$(acurl -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}')
# Test 16: PATCH workspace (update name) — WorkspaceAuth-gated; use Echo's token.
R=$(curl -s "${ECHO_AUTH[@]}" -X PATCH "$BASE/workspaces/$ECHO_ID" -H "Content-Type: application/json" -d '{"name":"Echo Agent v2"}')
check "PATCH /workspaces/:id (name)" '"status":"updated"' "$R"
R=$(acurl "$BASE/workspaces/$ECHO_ID")
check "Name updated" '"name":"Echo Agent v2"' "$R"
# Test 17: Events (#165 / PR #167 — now admin-gated, bearer required)
R=$(acurl "$BASE/events" -H "Authorization: Bearer $ECHO_TOKEN")
# Test 17: Events (#165 / PR #167 — admin-gated; the admin bearer is required,
# and Tier-2b rejects a workspace bearer here, so use acurl's admin token alone).
R=$(acurl "$BASE/events")
check "GET /events (has events)" 'WORKSPACE_ONLINE' "$R"
R=$(acurl "$BASE/events/$ECHO_ID" -H "Authorization: Bearer $ECHO_TOKEN")
R=$(acurl "$BASE/events/$ECHO_ID")
check "GET /events/:id (has events for echo)" 'WORKSPACE_ONLINE' "$R"
# Test 18: Update card
@@ -295,7 +310,7 @@ check "active_tasks cleared" '"active_tasks":0' "$R"
# endpoint is admin-auth gated and keeps the full record, so operators
# can still see task progress from the dashboard without exposing it
# over the public per-workspace GET.
R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN")
R=$(acurl "$BASE/workspaces")
check "current_task in list response" '"current_task"' "$R"
# Test 21: Delete
@@ -306,18 +321,20 @@ check "current_task in list response" '"current_task"' "$R"
# Delete the CHILD (Summarizer) here instead: a child delete does NOT cascade
# upward, so the parent Echo survives and count=1 holds. The bundle round-trip
# below needs Summarizer's exported config, so capture it BEFORE this delete.
BUNDLE=$(curl -s "$BASE/bundles/export/$SUM_ID" -H "Authorization: Bearer $SUM_TOKEN")
# GET /bundles/export/:id is admin-gated (router.go:741) — use the admin bearer.
BUNDLE=$(acurl "$BASE/bundles/export/$SUM_ID")
check "GET /bundles/export/:id" '"name":"Summarizer Agent"' "$BUNDLE"
ORIG_NAME=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['name'])")
ORIG_TIER=$(echo "$BUNDLE" | python3 -c "import sys,json; print(json.load(sys.stdin)['tier'])")
# DELETE /workspaces/:id is admin-gated (router.go:167). X-Confirm-Name must
# still match the workspace name even with admin auth.
R=$(acurl -X DELETE "$BASE/workspaces/$SUM_ID?confirm=true" \
-H "Authorization: Bearer $SUM_TOKEN" \
-H "X-Confirm-Name: Summarizer Agent")
check "DELETE /workspaces/:id" '"status":"removed"' "$R"
# Parent Echo must survive a child delete — list as Echo and expect count=1.
R=$(curl -s "$BASE/workspaces" -H "Authorization: Bearer $ECHO_TOKEN")
# Parent Echo must survive a child delete — list (admin) and expect count=1.
R=$(acurl "$BASE/workspaces")
COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
check "List after delete (count=1)" "1" "$COUNT"
@@ -328,21 +345,21 @@ check "List after delete (count=1)" "1" "$COUNT"
echo ""
echo "--- Bundle Round-Trip Test ---"
# Delete the remaining parent Echo — use ECHO_TOKEN (per-workspace) for
# WorkspaceAuth and ADMIN_TOKEN for the AdminAuth layer.
# Delete the remaining parent Echo — DELETE is admin-gated (router.go:167);
# the platform admin bearer (acurl) authorizes it. X-Confirm-Name still required.
R=$(acurl -X DELETE "$BASE/workspaces/$ECHO_ID?confirm=true" \
-H "Authorization: Bearer $ECHO_TOKEN" \
-H "X-Confirm-Name: Echo Agent v2")
check "Delete before re-import" '"status":"removed"' "$R"
# After deleting both workspaces, all per-workspace tokens are revoked.
# Clear the now-revoked admin bearer so acurl can use fresh-install fail-open.
ADMIN_TOKEN=""
# Both workspaces are now deleted. The platform-level ADMIN_TOKEN env is still
# set, so admin routes still require the admin bearer (fail-open does NOT
# re-engage just because the token table emptied) — keep using acurl's bearer.
R=$(acurl "$BASE/workspaces")
COUNT=$(echo "$R" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
check "All workspaces deleted (count=0)" "0" "$COUNT"
# Re-import from the exported bundle (AdminAuth fail-open — no live tokens)
# Re-import from the exported bundle. POST /bundles/import is admin-gated
# (router.go:742) — acurl sends the admin bearer.
R=$(acurl -X POST "$BASE/bundles/import" -H "Content-Type: application/json" -d "$BUNDLE")
check "POST /bundles/import" '"status":"provisioning"' "$R"
NEW_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin)['workspace_id'])")
@@ -398,12 +415,15 @@ check "Register re-imported workspace" '"status":"registered"' "$R"
REG_NEW_TOKEN=$(echo "$R" | e2e_extract_token)
[ -n "$REG_NEW_TOKEN" ] && NEW_TOKEN="$REG_NEW_TOKEN"
# Re-export and verify agent_card survives the round-trip (#165 / PR #167 — admin-gated)
REBUNDLE=$(curl -s "$BASE/bundles/export/$NEW_ID" -H "Authorization: Bearer $NEW_TOKEN")
# Re-export and verify agent_card survives the round-trip (#165 / PR #167 —
# GET /bundles/export/:id is admin-gated; use the admin bearer).
REBUNDLE=$(acurl "$BASE/bundles/export/$NEW_ID")
check "Re-exported bundle has agent_card" '"agent_card"' "$REBUNDLE"
# Clean up — use the token just issued to the re-imported workspace
e2e_delete_workspace "$NEW_ID" "$ORIG_NAME" -H "Authorization: Bearer $NEW_TOKEN"
# Clean up — DELETE /workspaces/:id is admin-gated; pass no per-call auth so
# e2e_delete_workspace falls back to the platform admin bearer (a workspace
# bearer would be rejected by Tier-2b).
e2e_delete_workspace "$NEW_ID" "$ORIG_NAME"
echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
+9 -2
View File
@@ -28,6 +28,13 @@ PASS=0
FAIL=0
WSID=""
# GET /workspaces (list) and POST /workspaces (create) are AdminAuth-gated
# (router.go:165-166). The e2e-api CI job sets ADMIN_TOKEN on the platform
# (fail-open OFF) and exports MOLECULE_ADMIN_TOKEN here, so these calls need the
# admin bearer. Guarded if-set so a fail-open dev platform still works.
ADMIN_AUTH=()
e2e_admin_auth_args ADMIN_AUTH
cleanup() {
# Workspace teardown — best-effort, ignore errors so an unrelated CP
# outage doesn't shadow a real test failure.
@@ -80,7 +87,7 @@ echo "=== Setup ==="
# canvas. Find and delete any with this exact name so the test is safe to
# re-run from any state. Match by name (not tag) so this also catches
# leftovers created by older script versions.
PRIOR=$(curl -s "$BASE/workspaces" | python3 -c '
PRIOR=$(curl -s "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} | python3 -c '
import json, sys
try:
print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name") == "Notify E2E"))
@@ -96,7 +103,7 @@ done
# feedback_workspace_model_required_no_platform_default_dynamic_credential_intake).
# The body below sets runtime=external; a model is still passed because the
# workspace-creation contract now requires one.
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
R=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
-d '{"name":"Notify E2E","tier":1,"runtime":"external","external":true,"model":"sonnet"}')
WSID=$(echo "$R" | python3 -c 'import json,sys;print(json.load(sys.stdin)["id"])' 2>/dev/null || true)
[ -n "$WSID" ] || { echo "Failed to create workspace: $R"; exit 1; }
+396 -19
View File
@@ -24,11 +24,73 @@
# Each phase skips cleanly when its prerequisite secret is absent so a
# partially-keyed env (e.g. CI without an OpenAI key) doesn't false-fail.
#
# REQUIRE-LIVE (false-green guard, mirrors CP serving-e2e's
# SERVING_E2E_REQUIRE_LIVE semantics)
# ------------------------------------------------------------------
# Without a guard, an env with NO live secrets makes every phase SKIP,
# leaving PASS=0 FAIL=0 — and the historical `[ "$FAIL" -eq 0 ]` gate
# exits 0 (GREEN) while validating ZERO runtimes. That made the REQUIRED
# `E2E API Smoke Test` merge gate pass without exercising a single
# runtime (false-green).
#
# Fix: a real "validated arm" counter (VALIDATED) tracks runtimes that
# actually ran AND produced a non-error A2A reply. With E2E_REQUIRE_LIVE=1
# (or "true"): if zero arms validated, the run exits NON-zero with a loud
# message. With any other value (including unset/0), a fully-skipped run
# stays a LOUD skip + exit 0 for dev convenience.
#
# This zero-validated→RED decision is the load-bearing logic. It is factored
# into evaluate_require_live_gate() (a pure function of $FAIL/$VALIDATED/
# $E2E_REQUIRE_LIVE, defined before any platform I/O) and is REGRESSION-GATED
# on every PR by tests/e2e/test_require_live_priority_gate_unit.sh, which
# sources this file (E2E_PRIORITY_UNIT_SOURCE=1), sets the counters, and
# asserts the gate's exit code — no platform, no provisioning, no network.
# So the false-green can't silently come back: a revert of the guard fails CI.
#
# CI POSTURE (REQUIRE-LIVE ON — see .gitea/workflows/e2e-api.yml):
# The live e2e-api job SETS E2E_REQUIRE_LIVE=1. The `mock` arm is the
# CI-provisionable live-completion arm: it org-imports a mock workspace
# (→online→canned A2A reply) with NO external secret. The only thing that
# previously blocked it in CI was admin auth — POST /org/import and POST
# /admin/workspaces/:id/tokens are AdminAuth-gated, and the job set no admin
# token, so every admin call 401'd ("admin auth required"). The job now sets
# ADMIN_TOKEN on the platform AND exports the matching MOLECULE_ADMIN_TOKEN
# the scripts send, so mock validates end-to-end and VALIDATED>=1 holds on a
# healthy platform — the REQUIRED `E2E API Smoke Test` gate now HONESTLY
# validates a runtime. If the mock plumbing or the admin-auth wiring breaks,
# the gate goes RED (not false-green). The zero-validated→RED decision is also
# regression-gated WITHOUT provisioning by the bash unit test above, so a
# revert of that logic still fails CI.
#
# LIVE ARMS (run when their prerequisite is present; opportunistic):
# - `mock` (run_mock) is the no-key REQUIRE-LIVE backbone: a virtual
# workspace (no container, no EC2, no provider) whose org-import path
# short-circuits to status='online' with a canned A2A reply. It validates
# in CI now that the e2e-api job wires an admin token (org-import + token
# mint are AdminAuth-gated), so it is the guaranteed >=1 validation.
# - MiniMax (E2E_MINIMAX_API_KEY, from MOLECULE_STAGING_MINIMAX_API_KEY) is
# an OPPORTUNISTIC best-effort real-LLM arm: registry-fragile in CI (422
# UNREGISTERED_MODEL_FOR_RUNTIME — see run_minimax header), so a miss is
# a best-effort MISS via bestfail() and does NOT red the gate.
# The CI e2e-api job sets E2E_REQUIRE_LIVE=1: mock guarantees a validation, so
# the REQUIRED gate is honest (RED if the mock plumbing/admin-auth breaks). The
# zero-validated→RED logic is also regression-gated by the bash unit test above.
#
# Usage:
# # Enforce REQUIRE-LIVE locally (need >=1 arm to actually validate):
# E2E_REQUIRE_LIVE=1 E2E_MINIMAX_API_KEY=... \
# tests/e2e/test_priority_runtimes_e2e.sh
#
# # Default (no enforcement): all-skip stays a LOUD skip + exit 0:
# tests/e2e/test_priority_runtimes_e2e.sh
#
# # Other live arms (if their secrets are configured):
# CLAUDE_CODE_OAUTH_TOKEN=... E2E_OPENAI_API_KEY=... \
# tests/e2e/test_priority_runtimes_e2e.sh
#
# # Run only one runtime
# E2E_RUNTIMES=mock tests/e2e/test_priority_runtimes_e2e.sh
# E2E_RUNTIMES=minimax tests/e2e/test_priority_runtimes_e2e.sh
# E2E_RUNTIMES=claude-code tests/e2e/test_priority_runtimes_e2e.sh
# E2E_RUNTIMES=hermes tests/e2e/test_priority_runtimes_e2e.sh
#
@@ -41,13 +103,81 @@
set -euo pipefail
source "$(dirname "$0")/_lib.sh"
PASS=0
FAIL=0
SKIP=0
# VALIDATED counts runtimes that ACTUALLY ran end-to-end (provisioned,
# reached online, AND returned a non-error A2A reply). Distinct from PASS,
# which also counts sub-assertions like activity-log rows. This is the
# signal the REQUIRE-LIVE gate keys off: VALIDATED==0 means we proved
# nothing about any runtime, regardless of how many sub-asserts "passed".
VALIDATED=0
CREATED_WSIDS=()
# evaluate_require_live_gate — the SINGLE source of the final exit decision.
# Pure function of $FAIL, $VALIDATED, and $E2E_REQUIRE_LIVE; performs NO I/O
# beyond the loud messages. Returns the exit code the script should exit with:
# - FAIL>0 → 1 (a real failure is always red)
# - VALIDATED==0 + REQUIRE_LIVE → 1 (false-green trap: proved nothing → RED)
# - VALIDATED==0 + !REQUIRE_LIVE → 0 (dev-convenience LOUD skip)
# - VALIDATED>=1 → 0 (at least one arm validated end-to-end)
# It is a function (not inline tail code) so test_require_live_priority_gate_unit.sh
# can drive the REAL decision in isolation — set the counters, call this, assert
# the return code — with no platform, no provisioning, no network. That makes the
# zero-validated→RED logic a CI-gated regression contract: a future revert of it
# fails the unit test on every PR. See that unit test for the fail-direction proof.
evaluate_require_live_gate() {
  # Decide the script's final exit code from three inputs only: $FAIL,
  # $VALIDATED and $E2E_REQUIRE_LIVE. Nothing is touched besides the
  # messages printed below.
  #   returns 1 - at least one failure was recorded, or enforcement is on
  #               ("1" or "true") and no runtime was validated
  #   returns 0 - otherwise
  local enforce="${E2E_REQUIRE_LIVE:-0}"

  # A recorded failure wins over everything else.
  if [ "$FAIL" -ne 0 ]; then
    return 1
  fi

  # Happy path: something was validated end-to-end.
  if ! [ "$VALIDATED" -eq 0 ]; then
    echo "OK: $VALIDATED runtime(s) validated end-to-end."
    return 0
  fi

  # From here on nothing was validated. Whether that is red or a loud skip
  # depends solely on the enforcement switch.
  case "$enforce" in
    1 | true)
      printf '%s\n' \
        "::error::E2E_REQUIRE_LIVE is set but ZERO runtimes were validated end-to-end." \
        " Every runtime SKIPPED — no live secret was present, so this gate" \
        " validated nothing. Wire at least one live arm via Gitea secrets" \
        " (E2E_MINIMAX_API_KEY ← MOLECULE_STAGING_MINIMAX_API_KEY is the" \
        " default CI arm; CLAUDE_CODE_OAUTH_TOKEN / E2E_OPENAI_API_KEY also" \
        " work) so >=1 runtime actually provisions + replies. Failing RED" \
        " instead of false-green." >&2
      return 1
      ;;
  esac

  # Enforcement off: report loudly on stderr but let the run pass.
  printf '%s\n' \
    "SKIPPED: no live secrets present and E2E_REQUIRE_LIVE is not set — validated" \
    " zero runtimes. This is a dev-convenience pass; CI sets" \
    " E2E_REQUIRE_LIVE=1 to make zero-validated a hard failure." >&2
  return 0
}
# Source-guard: when sourced by the unit test (E2E_PRIORITY_UNIT_SOURCE=1) we
# stop HERE — the counters + evaluate_require_live_gate are now defined, and we
# must NOT fall through to _lib.sh's platform-dependent helpers or the live
# pre-sweep curl below (there is no platform in the unit-test environment).
if [ "${E2E_PRIORITY_UNIT_SOURCE:-0}" = "1" ]; then
return 0
fi
source "$(dirname "$0")/_lib.sh"
# GET /workspaces (list, router.go:165) and POST /workspaces (create,
# router.go:166) are AdminAuth-gated. The e2e-api CI job sets ADMIN_TOKEN on the
# platform (fail-open OFF) and exports MOLECULE_ADMIN_TOKEN here, so the
# pre-sweep list and every runtime-create must send the admin bearer or they
# 401. run_mock uses POST /org/import (also admin-gated) and wires its own admin
# auth inline. Guarded if-set so a fail-open dev platform still works.
ADMIN_AUTH=()
e2e_admin_auth_args ADMIN_AUTH
cleanup() {
# `set -u` + empty array would error on "${CREATED_WSIDS[@]}"; the
# ${VAR[@]+"…"} form expands to nothing when the array is unset/empty
@@ -58,14 +188,26 @@ cleanup() {
}
trap cleanup EXIT
pass() { echo " PASS — $1"; PASS=$((PASS + 1)); }
fail() { echo " FAIL — $1"; echo " $2"; FAIL=$((FAIL + 1)); }
skip() { echo " SKIP — $1"; SKIP=$((SKIP + 1)); }
pass() { echo " PASS — $1"; PASS=$((PASS + 1)); }
fail() { echo " FAIL — $1"; echo " $2"; FAIL=$((FAIL + 1)); }
skip() { echo " SKIP — $1"; SKIP=$((SKIP + 1)); }
# Mark a runtime as having been validated end-to-end (online + non-error
# A2A reply). Also emits a PASS line so it shows in the results tally.
validated() {
  # Report one PASS line for the given label and bump two counters: PASS
  # (the ordinary results tally) and VALIDATED (the end-to-end arm count).
  local label="$1"
  echo " PASS — $label"
  PASS=$((PASS + 1))
  VALIDATED=$((VALIDATED + 1))
}
# bestfail() is for OPPORTUNISTIC (best-effort) arms whose failure must
# NOT red the gate. It does NOT increment FAIL — it only logs + bumps
# SKIP so the tally stays honest ("we tried, it didn't validate, but it
# was never load-bearing"). Used by the MiniMax arm: MiniMax-create is
# fragile in CI (registry-skewed model id, BYOK plumbing — see core#2263
# and the run_minimax header), so a MiniMax miss is reported but never
# fails the REQUIRED gate. The mock arm is the load-bearing validation
# that keeps the gate honest; MiniMax is the real-LLM bonus on top.
bestfail() {
  # Log a best-effort miss ($1 = headline, $2 = detail) and count it under
  # SKIP. FAIL is deliberately left untouched.
  local headline="$1"
  local detail="$2"
  echo " BEST-EFFORT MISS — $headline"
  echo " $detail"
  SKIP=$((SKIP + 1))
}
# Pre-sweep any prior runs that left workspaces behind (same defence as
# test_notify_attachments_e2e.sh: trap fires on normal exit, but a
# SIGPIPE / kill -9 can bypass it).
PRIOR=$(curl -s "$BASE/workspaces" | python3 -c '
PRIOR=$(curl -s "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} | python3 -c '
import json, sys
try:
print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name","").startswith("Priority E2E ")))
@@ -188,7 +330,7 @@ print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN
")
local resp wsid
# model required (CTO 2026-05-22 SSOT) — pass the deleted DefaultModel("claude-code") value.
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
-d "{\"name\":\"Priority E2E (claude-code)\",\"runtime\":\"claude-code\",\"model\":\"sonnet\",\"tier\":1,\"secrets\":$secrets}")
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
if [ -z "$wsid" ]; then
@@ -220,9 +362,9 @@ print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
if echo "$reply" | grep -q "PONG"; then
pass "claude-code reply contains PONG"
validated "claude-code reply contains PONG"
else
pass "claude-code reply non-empty (first 80 chars: ${reply:0:80})"
validated "claude-code reply non-empty (first 80 chars: ${reply:0:80})"
fi
assert_activity_logged "claude-code" "$wsid" "$token"
else
@@ -254,7 +396,7 @@ print(json.dumps({
}))
")
local resp wsid
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
-d "{\"name\":\"Priority E2E (hermes)\",\"runtime\":\"hermes\",\"tier\":1,\"model\":\"openai/gpt-4o\",\"secrets\":$secrets}")
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
if [ -z "$wsid" ]; then
@@ -288,9 +430,9 @@ print(json.dumps({
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
if echo "$reply" | grep -q "PONG"; then
pass "hermes reply contains PONG"
validated "hermes reply contains PONG"
else
pass "hermes reply non-empty (first 80 chars: ${reply:0:80})"
validated "hermes reply non-empty (first 80 chars: ${reply:0:80})"
fi
assert_activity_logged "hermes" "$wsid" "$token"
else
@@ -327,7 +469,7 @@ print(json.dumps({
}))
")
local resp wsid
resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
-d "{\"name\":\"Priority E2E ($runtime)\",\"runtime\":\"$runtime\",\"tier\":1,\"model\":\"openai/gpt-4o-mini\",\"secrets\":$secrets}")
wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
if [ -z "$wsid" ]; then
@@ -358,9 +500,9 @@ print(json.dumps({
local reply
if reply=$(send_test_prompt "$wsid" "$token"); then
if echo "$reply" | grep -q "PONG"; then
pass "$runtime reply contains PONG"
validated "$runtime reply contains PONG"
else
pass "$runtime reply non-empty (first 80 chars: ${reply:0:80})"
validated "$runtime reply non-empty (first 80 chars: ${reply:0:80})"
fi
assert_activity_logged "$runtime" "$wsid" "$token"
else
@@ -371,18 +513,253 @@ print(json.dumps({
run_codex() { run_openai_runtime "codex" "codex"; }
run_openclaw() { run_openai_runtime "openclaw" "openclaw"; }
WANT="${E2E_RUNTIMES:-claude-code codex hermes openclaw}"
####################################################################
# Mock arm — the GUARANTEED, always-available REQUIRE-LIVE backbone.
####################################################################
# The mock runtime (workspace-server/internal/handlers/mock_runtime.go)
# is a virtual workspace: NO container, NO EC2, NO LLM key. The org-import
# path (createWorkspaceTree, org_import.go) short-circuits a runtime=mock
# workspace straight to status='online' (no provisioner needed), and the
# A2A proxy (a2a_proxy.go → handleMockA2A) synthesises a deterministic
# canned JSON-RPC reply with logActivity=true (writes the activity_logs
# row too). That makes mock the perfect REQUIRE-LIVE backbone: it
# exercises the SAME plumbing every real runtime needs to pass —
# provision-decision → status=online → A2A round-trip → activity_logs —
# without depending on any external provider key or LLM availability. It
# is GREEN on a healthy platform and RED only if that plumbing genuinely
# breaks (DB insert, status flip, A2A proxy, activity logging). No more
# false-green (zero-validated is impossible when mock works), and no more
# can't-go-green (mock needs no secret, so it always runs in CI).
#
# Why org-import (POST /org/import) instead of POST /workspaces:
# The mock→online short-circuit lives ONLY in createWorkspaceTree
# (org_import.go). The single-workspace Create handler (workspace.go)
# has no mock branch — it routes runtime=mock through
# provisionWorkspaceAuto, which in CI's local-build mode has no mock
# image and would never reach online. Org-import is the supported path
# to a live mock workspace, so the arm drives it.
#
# The canned reply is one of the "On it!" variants (NOT "PONG"), so this
# arm validates on the non-empty / non-error branch — that is the real
# contract for mock (it proves the plumbing, not an LLM's instruction-
# following).
# run_mock — drive the no-key mock runtime through the full plumbing path:
# org-import create → status=online → token mint → A2A round-trip →
# activity log. Takes no arguments and always returns 0; outcomes are
# reported through the pass/fail/validated helpers. Reads BASE,
# MOLECULE_ADMIN_TOKEN / ADMIN_TOKEN; appends to CREATED_WSIDS.
run_mock() {
  echo ""
  echo "=== mock (no-key plumbing backbone) happy path ==="

  # There is deliberately no secret gate here: this arm always runs, so a
  # REQUIRE-LIVE run has something to validate even without provider keys.

  # POST /org/import sits behind admin auth. Send the same bearer the mint
  # helper uses (MOLECULE_ADMIN_TOKEN, falling back to ADMIN_TOKEN), but
  # only when one is configured, so a token-less dev platform still works.
  local bearer="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
  local auth_hdr=()
  if [ -n "$bearer" ]; then
    auth_hdr=(-H "Authorization: Bearer $bearer")
  fi

  # Single-workspace mock org. `model` is required by the org-import
  # contract; per the original notes mock never uses it, so any non-empty
  # value satisfies the contract. The literal is kept verbatim.
  local org_payload='{
"template": {
"name": "Priority E2E Mock Org",
"defaults": {"runtime": "mock", "model": "mock", "tier": 1},
"workspaces": [
{"name": "Priority E2E (mock)", "runtime": "mock", "model": "mock", "tier": 1}
]
}
}'

  local import_resp mock_id
  import_resp=$(curl -s -X POST "$BASE/org/import" -H "Content-Type: application/json" \
    ${auth_hdr[@]+"${auth_hdr[@]}"} \
    -d "$org_payload")

  # The response carries a "workspaces" list of {id, name, ...}; "results"
  # is accepted as an alternate key. Print the id of the workspace we
  # declared, or nothing if the body is not JSON / has no match.
  mock_id=$(echo "$import_resp" | python3 -c '
import json, sys
try:
    doc = json.load(sys.stdin)
except Exception:
    sys.exit(0)
rows = doc.get("workspaces") or doc.get("results") or []
for row in rows:
    if row.get("name") == "Priority E2E (mock)" and row.get("id"):
        print(row["id"])
        break
') || true

  if [ -z "$mock_id" ]; then
    # A missing id is treated as a real break (admin-auth wiring, org-import
    # create, or the mock short-circuit), so it is a hard fail() rather than
    # a best-effort miss. The raw response is passed along so the cause
    # (e.g. {"error":"admin auth required"}) is visible in the log.
    fail "create mock workspace (org-import)" "$import_resp"
    return 0
  fi
  CREATED_WSIDS+=("$mock_id")
  echo " workspace=$mock_id"

  # Mock is expected to go online without a container boot, so a short
  # budget suffices; anything other than "online" fails the arm.
  local state
  state=$(wait_for_status "$mock_id" "online failed" 60) || true
  if [ "$state" != "online" ]; then
    fail "mock workspace reaches online" "final status: $state (mock should go online without provisioning)"
    return 0
  fi
  pass "mock workspace reaches online"

  # Org-import does not hand back an inline token; mint one via the admin
  # endpoint helper.
  local ws_token
  ws_token=$(e2e_mint_workspace_token "$mock_id") || true
  if [ -z "$ws_token" ]; then
    fail "resolve mock workspace token" "no token returned from POST /admin/workspaces/:id/tokens"
    return 0
  fi

  # A2A round-trip. The canned mock reply is not "PONG", so any successful
  # send_test_prompt counts as validation of the round-trip.
  local reply
  if ! reply=$(send_test_prompt "$mock_id" "$ws_token"); then
    fail "mock reply" "${reply:-<empty or error>} (mock A2A short-circuit should always return a canned reply)"
    return 0
  fi
  validated "mock reply non-empty (canned; first 80 chars: ${reply:0:80})"
  assert_activity_logged "mock" "$mock_id" "$ws_token"
}
####################################################################
# MiniMax live arm — OPPORTUNISTIC (best-effort) real-LLM arm.
####################################################################
# NOTE: this is now a BEST-EFFORT arm, not the REQUIRE-LIVE backbone.
# mock (run_mock above) is the guaranteed, no-key validation that keeps
# the gate honest. MiniMax-create is fragile in CI: the namespaced model
# id minimax:MiniMax-M2.7 is NOT in claude-code's native model set and
# does NOT resolve via DeriveProvider (its only prefix-owner, byok-minimax,
# is not wired as a claude-code runtime arm), so the create is rejected
# 422 UNREGISTERED_MODEL_FOR_RUNTIME before any provisioning (RCA core
# registry_gen.go Runtimes["claude-code"]). Rather than red the REQUIRED
# gate on that registry-skew (or on any transient MiniMax provisioning /
# model-registration issue), this arm reports a best-effort MISS via
# bestfail() and lets mock carry the validation. If MiniMax DOES come up
# it validates as a bonus real-LLM check.
# Drives the claude-code runtime against MiniMax (BYOK) using the
# already-present Gitea secret MOLECULE_STAGING_MINIMAX_API_KEY,
# surfaced into the env as E2E_MINIMAX_API_KEY (same name + secret the
# staging-smoke / continuous-synth canaries use — see staging-smoke.yml
# and continuous-synth-e2e.yml). NO new credential is introduced.
#
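# Rationale for the provider/model choice. NOTE(review): this section
# appears to predate the best-effort demotion described in the NOTE above
# (mock, not MiniMax, is now the arm that keeps the REQUIRED gate honest),
# and its claim below that the namespaced id is a registered claude-code
# arm conflicts with the 422 UNREGISTERED_MODEL_FOR_RUNTIME report above —
# confirm against registry_gen.go before relying on either statement: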
# - claude-code's `minimax` provider (providers.yaml / registry_gen.go)
# is third_party_anthropic_compat: it reads MINIMAX_API_KEY at boot
# and routes ANTHROPIC_BASE_URL → api.minimax.io/anthropic. So the
# ONLY tenant secret needed is {"MINIMAX_API_KEY": <key>} — exactly
# the SECRETS_JSON branch test_staging_full_saas.sh uses.
# - Model id is the NAMESPACED colon-form `minimax:MiniMax-M2.7`, the
# registered BYOK arm for claude-code (registry_gen.go Runtimes
# ["claude-code"]["minimax"]). Per core#2263 the BARE `MiniMax-M2`
# id can 400 on a registry-skewed ws-server build; the namespaced
# form resolves the way kimi's `moonshot/…` does, so it's the
# robust choice for the gate.
# run_minimax — best-effort real-LLM arm: create a claude-code workspace
# backed by MiniMax (BYOK), wait for it to come online, resolve a token and
# do one A2A round-trip.
#
# Reads:   E2E_MINIMAX_API_KEY (arm is skipped when empty/unset), BASE,
#          ADMIN_AUTH (optional extra curl args).
# Writes:  appends the created workspace id to CREATED_WSIDS.
# Returns: always 0. Every miss is reported through bestfail(), never
#          fail(), so this arm cannot red the gate on its own.
#
# Every command substitution whose helper may exit non-zero is guarded with
# `|| ...`. If the script runs under errexit (the `|| true` guards used
# elsewhere in this file suggest it does), an unguarded `var=$(helper)`
# would abort the whole run instead of recording a best-effort miss.
run_minimax() {
  echo ""
  echo "=== minimax (claude-code BYOK) happy path ==="

  if [ -z "${E2E_MINIMAX_API_KEY:-}" ]; then
    skip "E2E_MINIMAX_API_KEY not set (MiniMax live arm needs the MiniMax key)"
    return 0
  fi

  # Build the tenant secrets JSON in Python so the key is JSON-escaped
  # properly and never interpolated into the shell command line.
  local secrets
  secrets=$(python3 -c "
import json, os
# The only tenant secret passed for this arm is the MiniMax key itself.
print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))
") || secrets=""
  if [ -z "$secrets" ]; then
    bestfail "build minimax secrets payload (best-effort)" "python3 produced no secrets JSON (is E2E_MINIMAX_API_KEY exported?)"
    return 0
  fi

  local resp wsid
  # Namespaced BYOK model id (core#2263): the bare MiniMax-M2 id can 400 on
  # a registry-skewed ws-server build. NOTE(review): the section header
  # reports that this namespaced id can itself be rejected with 422; either
  # way a rejected create is handled as a best-effort miss just below.
  resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
    -d "{\"name\":\"Priority E2E (minimax)\",\"runtime\":\"claude-code\",\"model\":\"minimax:MiniMax-M2.7\",\"tier\":1,\"secrets\":$secrets}") || true
  wsid=$(printf '%s\n' "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
  if [ -z "$wsid" ]; then
    # Do NOT red the gate; surface the create response so the cause is visible.
    bestfail "create minimax workspace (best-effort; mock carries the gate)" "$resp"
    return 0
  fi
  CREATED_WSIDS+=("$wsid")
  echo " workspace=$wsid"

  # Allow a generous boot budget. A slow first MiniMax call is covered by
  # send_test_prompt's own timeout.
  local final
  final=$(wait_for_status "$wsid" "online failed" 240) || true
  if [ "$final" != "online" ]; then
    bestfail "minimax workspace reaches online (best-effort)" "final status: $final"
    return 0
  fi
  pass "minimax workspace reaches online"

  # Prefer the token returned inline by the create call; fall back to
  # minting one. Both helpers are guarded so a non-zero exit becomes an
  # empty token (reported below) instead of killing the script.
  local token
  token=$(printf '%s\n' "$resp" | e2e_extract_token) || true
  if [ -z "$token" ]; then
    token=$(e2e_mint_workspace_token "$wsid") || true
  fi
  if [ -z "$token" ]; then
    bestfail "resolve minimax workspace token (best-effort)" "no token returned"
    return 0
  fi

  local reply
  if reply=$(send_test_prompt "$wsid" "$token"); then
    if echo "$reply" | grep -q "PONG"; then
      validated "minimax reply contains PONG"
    else
      validated "minimax reply non-empty (first 80 chars: ${reply:0:80})"
    fi
    assert_activity_logged "minimax" "$wsid" "$token"
  else
    bestfail "minimax reply (best-effort)" "${reply:-<empty or error>}"
  fi
}
# `mock` runs FIRST and by default: it is the no-key REQUIRE-LIVE backbone
# that guarantees >=1 validation on a healthy platform (see run_mock). The
# real-LLM arms (claude-code/codex/hermes/openclaw/minimax) run if their
# secrets are present and add real-provider coverage on top; minimax is
# best-effort (never reds the gate).
# Space-separated list of arms to run; overridable via E2E_RUNTIMES.
WANT="${E2E_RUNTIMES:-mock claude-code codex hermes openclaw minimax}"
# $WANT is intentionally unquoted so the list word-splits into arm names.
for r in $WANT; do
  case "$r" in
    mock)        run_mock ;;
    claude-code) run_claude_code ;;
    codex)       run_codex ;;
    hermes)      run_hermes ;;
    openclaw)    run_openclaw ;;
    minimax)     run_minimax ;;
    # Exactly one `all` arm. `case` takes the first matching pattern, so a
    # second, earlier `all)` that omitted mock/minimax would silently win
    # and leave this one unreachable.
    all)         run_mock; run_claude_code; run_codex; run_hermes; run_openclaw; run_minimax ;;
    *) echo "unknown runtime in E2E_RUNTIMES: $r" >&2; exit 2 ;;
  esac
done
echo ""
echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped, $VALIDATED runtime(s) validated end-to-end ==="
# The final exit decision lives in evaluate_require_live_gate (defined at the
# top of this file, before any platform I/O) so the same logic is unit-tested
# in isolation by test_require_live_priority_gate_unit.sh. That gate already
# reds on FAIL > 0, so no separate `[ "$FAIL" -eq 0 ]` check belongs here: it
# would be a no-op at best, and under errexit it would abort before the gate
# (and its zero-validated check) ever ran. Mirror the gate's return code into
# the process exit code.
evaluate_require_live_gate
exit $?
+114
View File
@@ -0,0 +1,114 @@
#!/usr/bin/env bash
# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE zero-validated
# gate in test_priority_runtimes_e2e.sh (the REQUIRED `E2E API Smoke Test`).
#
# WHY (harden/enforce-ci-gates-core-v2, PR #2286): the priority-runtimes E2E's
# only historical exit gate was `[ "$FAIL" -eq 0 ]`. When every runtime SKIPs
# because no live secret is present — exactly what the CI step did — PASS=0
# FAIL=0 and the script exited 0 (GREEN) while validating ZERO runtimes. The
# REQUIRED merge gate was therefore false-green: passing without exercising a
# single runtime. The fix adds a VALIDATED counter and makes a zero-validated
# run RED when E2E_REQUIRE_LIVE is set.
#
# That zero-validated→RED decision lives in evaluate_require_live_gate() in
# test_priority_runtimes_e2e.sh. CI cannot prove it via a live arm — the CI
# substrate can't provision ANY runtime end-to-end (MiniMax 422, mock org-
# import create fails, claude-code needs a key CI lacks), so the live e2e-api
# job does NOT force E2E_REQUIRE_LIVE (that would red the required gate for
# everyone). This UNIT test is the regression coverage instead: it drives the
# REAL evaluate_require_live_gate() function — not a copy — in isolation by
# sourcing the script with E2E_PRIORITY_UNIT_SOURCE=1 (which stops before any
# platform I/O), setting the counters, and asserting the gate's return code.
#
# Because it exercises the actual function, a future revert of the zero-
# validated→RED logic in test_priority_runtimes_e2e.sh fails THIS test on
# every PR — so the false-green can't silently come back.
#
# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
# logic — so it runs on every PR in the fast lane and locally via `bash`.
# No errexit on purpose: a failing case must be counted, not abort the harness.
set -uo pipefail

# The gate script is expected to live next to this file.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
GATE_SCRIPT="$SCRIPT_DIR/test_priority_runtimes_e2e.sh"
[ -f "$GATE_SCRIPT" ] || {
  echo "FATAL: cannot find $GATE_SCRIPT" >&2
  exit 2
}

# Harness counters (distinct from the VALIDATED/FAIL values fed to the gate).
PASS=0
FAIL=0
# run_case <E2E_REQUIRE_LIVE value> <VALIDATED count> <FAIL count>
# Prints the exit status of the real evaluate_require_live_gate for one
# scenario. A child `bash -c` sources $GATE_SCRIPT with
# E2E_PRIORITY_UNIT_SOURCE=1 (the unit source-guard, intended to make the
# script return after defining the gate and before any platform I/O), then
# overrides VALIDATED and FAIL and calls the gate. Running in a child process
# keeps shell options set by the sourced script from leaking between cases
# or killing this harness.
run_case() {
  local want_live="$1" n_validated="$2" n_failed="$3"
  local rc=0
  E2E_PRIORITY_UNIT_SOURCE=1 \
  E2E_REQUIRE_LIVE="$want_live" \
  GATE_SCRIPT="$GATE_SCRIPT" \
  VAL="$n_validated" \
  FL="$n_failed" \
    bash -c '
      set -uo pipefail
      # shellcheck disable=SC1090
      source "$GATE_SCRIPT"
      VALIDATED="$VAL"
      FAIL="$FL"
      # Last command: its status becomes the exit status of this bash -c.
      evaluate_require_live_gate >/dev/null 2>&1
    ' || rc=$?
  echo "$rc"
}
# assert_rc <label> <E2E_REQUIRE_LIVE> <VALIDATED> <FAIL> <expected rc>
# Runs one scenario through run_case and compares the gate's exit status
# with the expected one. A match is logged to stdout and bumps the harness
# PASS counter; a mismatch is logged to stderr and bumps FAIL.
assert_rc() {
  local label="$1" require_live="$2" validated="$3" failcount="$4" expected="$5"
  local got
  got=$(run_case "$require_live" "$validated" "$failcount")
  local scenario="REQUIRE_LIVE=$require_live VALIDATED=$validated FAIL=$failcount"
  if [ "$got" != "$expected" ]; then
    echo "$label: $scenario expected=$expected OBSERVED=$got" >&2
    FAIL=$((FAIL + 1))
    return
  fi
  echo "$label: $scenario → rc=$got"
  PASS=$((PASS + 1))
}
echo "=== E2E_REQUIRE_LIVE priority-runtimes zero-validated gate proof ==="
echo " (drives the REAL evaluate_require_live_gate from $GATE_SCRIPT)"
echo

# Scenario table, one row per case, run in order:
#   label | E2E_REQUIRE_LIVE | VALIDATED | FAIL | expected gate rc
# Rows 1-3: with require-live set, zero validated must be RED and >=1
#           validated GREEN (row 1 is the decisive false-green trap).
# Row 4:    require-live off (0) + zero validated stays GREEN (loud skip).
# Row 5:    the string form "true" is honoured like 1.
# Rows 6-7: a real FAIL is RED regardless of REQUIRE_LIVE / VALIDATED.
GATE_CASES=(
  "require-live, zero validated → RED (the false-green trap)|1|0|0|1"
  "require-live, one validated → GREEN|1|1|0|0"
  "require-live, several validated → GREEN|1|3|0|0"
  "no require-live, zero validated → GREEN (dev-convenience loud skip)|0|0|0|0"
  "require-live='true', zero validated → RED|true|0|0|1"
  "real FAIL with validations, no require-live → RED|0|2|1|1"
  "real FAIL, zero validated, no require-live → RED|0|0|1|1"
)
for gate_case in "${GATE_CASES[@]}"; do
  IFS='|' read -r case_label case_live case_validated case_failed case_expected <<<"$gate_case"
  assert_rc "$case_label" "$case_live" "$case_validated" "$case_failed" "$case_expected"
done

echo
echo "=== Results: $PASS passed, $FAIL failed ==="
# Exit status of the harness: zero only if every scenario matched.
[ "$FAIL" -eq 0 ]