Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d3d108a636 | |||
| 885cf423cc | |||
| d037e24cb0 | |||
| d1c0a66e14 | |||
| 9d23a7ef9f | |||
| 4752a78d21 | |||
| 6f7fa42b9c | |||
| 6884fff0b2 | |||
| 27f8f4dba2 | |||
| d063ecd186 | |||
| fc6850196b | |||
| 8a5c6cf771 | |||
| c8efa8f82a | |||
| f04c80b606 | |||
| 7f3a4491bb | |||
| 7c6986a96b | |||
| 19aa126c18 | |||
| 5f99c29de3 | |||
| d9ff9d036a | |||
| 91c9893ad4 | |||
| d8ff0b2503 | |||
| f0dec49793 | |||
| 10b7f8a99a | |||
| 6447edd2fd | |||
| 32e6427483 | |||
| e7968115ba | |||
| d0ab3d7c4b | |||
| f0b6079a82 | |||
| e6da3b29fb |
@@ -364,6 +364,14 @@ jobs:
|
||||
# check missed. If a refactor weakens the gate to a shape check,
|
||||
# this step goes red on every PR.
|
||||
bash tests/e2e/test_completion_assert_unit.sh
|
||||
# harden/e2e-staging-saas-failclosed: fail-direction proof for the
|
||||
# E2E_REQUIRE_LIVE fail-closed-on-skip guard in
|
||||
# test_staging_full_saas.sh. Offline (no LLM/network/provisioning):
|
||||
# asserts the guard exits 5 when a live lifecycle did NOT run and
|
||||
# passes when all milestones fired — so a refactor that lets the
|
||||
# staging gate report green without a real provision→online→A2A
|
||||
# cycle goes red on every PR.
|
||||
bash tests/e2e/test_require_live_guard_unit.sh
|
||||
|
||||
- if: ${{ needs.changes.outputs.scripts == 'true' }}
|
||||
name: Test ECR promote-tenant-image script (mock-driven, no live infra)
|
||||
|
||||
@@ -394,6 +394,21 @@ jobs:
|
||||
- name: Run E2E API tests
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_api.sh
|
||||
- name: Run keyless feature-contract E2E (terminal-diagnose / webhooks / budget / checkpoints / audit / traces / session-search / rescue / llm-billing-mode / resume / hibernate)
|
||||
# Keyless required-lane coverage for feature endpoints that ship without
|
||||
# an LLM key (runtime=external fixture). Each asserts the real HTTP
|
||||
# contract + a meaningful failure mode (401/400/fail-closed) so a
|
||||
# regression goes RED, not silently green. The mock-runtime A2A canned
|
||||
# round-trip is covered by the priority-runtimes `mock` arm, not here.
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_keyless_feature_contracts_e2e.sh
|
||||
- name: Run secrets-dispatch contract test (keyless SECRETS_JSON branch order)
|
||||
# Previously orphaned (no workflow referenced it). Hermetic unit-style
|
||||
# contract over test_staging_full_saas.sh's LLM-key branch precedence —
|
||||
# needs no platform, no bearer, no network. Guards the 2026-05-03
|
||||
# "wrong key shape wins" incident class.
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_secrets_dispatch.sh
|
||||
- name: Run notify-with-attachments E2E
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: bash tests/e2e/test_notify_attachments_e2e.sh
|
||||
|
||||
@@ -113,6 +113,28 @@ jobs:
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
#
|
||||
# PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
|
||||
# without CTO sign-off, that's the irreversible call):
|
||||
# NOW FAIL-CLOSED:
|
||||
# - Postgres/Redis/platform/canvas readiness are already bounded
|
||||
# readiness-polls that hard-fail (and dump logs) at their deadline,
|
||||
# not fixed sleeps — preserved.
|
||||
# - passWithNoTests:false + forbidOnly (playwright.config.ts) → a
|
||||
# renamed/moved spec or stray test.only can no longer green the lane.
|
||||
# - REQUIRE-LIVE guard in "Run Playwright E2E tests" → chat==true must
|
||||
# actually execute >=1 test, else exit 1.
|
||||
# - chat-desktop "activity log" test no longer swallows its assertion.
|
||||
# STILL BLOCKS PROMOTION:
|
||||
# - The echo round-trip asserts on rendered "Echo: ..." text but never
|
||||
# asserts the echo runtime actually RECEIVED the A2A request
|
||||
# (fixtures/echo-runtime.ts exposes lastRequest, unused) — an
|
||||
# optimistic client-side render could pass without a real round-trip.
|
||||
# Add a server-received assertion before required.
|
||||
# - The "No-op pass" path (detect-changes chat!=true) is a legitimate
|
||||
# paths-filter skip, but a required gate needs it to be a neutral
|
||||
# check, not a green "success", so a skipped heavy lane can't be
|
||||
# mistaken for a passed one.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
@@ -334,11 +356,32 @@ jobs:
|
||||
- name: Run Playwright E2E tests
|
||||
if: needs.detect-changes.outputs.chat == 'true'
|
||||
working-directory: canvas
|
||||
env:
|
||||
# CI=1 activates forbidOnly in playwright.config.ts (a stray
|
||||
# `test.only` would otherwise green the suite while skipping the
|
||||
# rest). passWithNoTests:false (also in the config) already makes
|
||||
# a zero-match selection exit non-zero.
|
||||
CI: "1"
|
||||
run: |
|
||||
set -euo pipefail
|
||||
export E2E_PLATFORM_URL="http://127.0.0.1:${PLATFORM_PORT}"
|
||||
export E2E_DATABASE_URL="${DATABASE_URL}"
|
||||
export PLAYWRIGHT_BASE_URL="http://localhost:${CANVAS_PORT}"
|
||||
npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts
|
||||
|
||||
# REQUIRE-LIVE guard (mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE):
|
||||
# this lane reached here only because detect-changes said chat==true,
|
||||
# so it MUST actually execute the round-trip specs. `pipefail` makes
|
||||
# a real test failure (playwright non-zero) abort here under `set -e`;
|
||||
# passWithNoTests:false makes a zero-match selection non-zero too. The
|
||||
# explicit grep below is belt-and-braces: assert the list reporter
|
||||
# printed an executed-count summary, so a silent all-skip / no-op can
|
||||
# never report green.
|
||||
npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts \
|
||||
--reporter=list 2>&1 | tee /tmp/pw-chat.out
|
||||
if ! grep -qE '[0-9]+ (passed|failed|skipped)' /tmp/pw-chat.out; then
|
||||
echo "::error::E2E Chat REQUIRE-LIVE: chat==true but Playwright reported no executed tests — specs missing or all-skipped, refusing to report green."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Dump platform log on failure
|
||||
if: failure() && needs.detect-changes.outputs.chat == 'true'
|
||||
|
||||
@@ -12,9 +12,30 @@ name: E2E Staging Canvas (Playwright)
|
||||
#
|
||||
|
||||
# Playwright test suite that provisions a fresh staging org per run and
|
||||
# verifies every workspace-panel tab renders without crashing. Complements
|
||||
# e2e-staging-saas.yml (which tests the API shape) by exercising the
|
||||
# actual browser + canvas bundle against live staging.
|
||||
# verifies every workspace-panel tab renders REAL content (not just an
|
||||
# empty/errored container). Complements e2e-staging-saas.yml (which tests
|
||||
# the API shape) by exercising the actual browser + canvas bundle against
|
||||
# live staging.
|
||||
#
|
||||
# PROMOTION-READINESS (toward making this a HARD merge-gate):
|
||||
# NOW RELIABLE (spec hardened — staging-tabs.spec.ts):
|
||||
# - All waits condition-based (toBeVisible/toHaveAttribute/expect.poll);
|
||||
# no fixed waitForTimeout in the spec.
|
||||
# - Tabs asserted on settled REAL content, not "container visible".
|
||||
# - ErrorBoundary + visible error alerts fail non-degraded tabs.
|
||||
# - Tab-list parity-checked vs live DOM; fail-closed on missing tenant.
|
||||
# STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT remove continue-on-error —
|
||||
# CTO-owned, RFC internal#219 §1):
|
||||
# - Infra dependency: real staging EC2 per run (12-20 min cold boot);
|
||||
# AWS/Cloudflare/CP availability would become merge-blockers.
|
||||
# - Shared-zone TLS/DNS/ACME propagation flake surface is upstream of
|
||||
# this repo and outside its control.
|
||||
# - Required-gate correctness needs CP_STAGING_ADMIN_API_TOKEN GUARANTEED
|
||||
# present; today's skip-if-absent (core#2225) is right for non-gating
|
||||
# but would skip-green a required check.
|
||||
# - Single hermes/platform_managed workspace; agent-dependent content
|
||||
# (live chat/traces round-trip) not exercised on staging (#2162).
|
||||
# The full checklist lives at the foot of canvas/e2e/staging-tabs.spec.ts.
|
||||
#
|
||||
# Triggers: push to main, PR touching canvas sources + this workflow only
|
||||
# after the PR enters `merge-queue`, manual dispatch, and scheduled cron to
|
||||
|
||||
@@ -85,6 +85,25 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
#
|
||||
# PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
|
||||
# without CTO sign-off, that's the irreversible call):
|
||||
# NOW FAIL-CLOSED:
|
||||
# - Missing CP_STAGING_ADMIN_API_TOKEN → hard exit 2 (preflight).
|
||||
# - Staging CP unhealthy → hard exit 1 (preflight, not a workspace bug).
|
||||
# - Harness E2E_REQUIRE_LIVE=1 → exit 5 if a clean exit didn't prove
|
||||
# all four awaiting_agent transitions (no silent skip).
|
||||
# - Sweep transition (step 6) is now a bounded readiness-poll, not a
|
||||
# fixed sleep + one-shot assert → no more sweep-cadence flake.
|
||||
# - register / re-register retry ONLY transient edge 5xx (bounded),
|
||||
# fail closed on 4xx → no more cold-boot-502 flake.
|
||||
# STILL BLOCKS PROMOTION:
|
||||
# - Single shared staging tenant + EC2 quota window: an infra-side
|
||||
# provisioning outage (not a code bug) would turn the gate red.
|
||||
# Needs an infra-class vs code-class signal split before required.
|
||||
# - "CP unhealthy → exit 1" currently looks identical to a real
|
||||
# failure on the run page; required-gate would need it demoted to
|
||||
# a neutral/skip so staging flakiness can't block merges.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 25
|
||||
|
||||
@@ -124,6 +143,15 @@ jobs:
|
||||
|
||||
- name: Run external-runtime E2E
|
||||
id: e2e
|
||||
# E2E_REQUIRE_LIVE=1: the harness fails CLOSED (exit 5) if it ever
|
||||
# reaches a clean exit without proving all four awaiting_agent
|
||||
# transitions. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE — a
|
||||
# silent skip / early-return / dropped assertion can no longer
|
||||
# masquerade as green. Token-missing and CP-unhealthy already
|
||||
# hard-fail in the two preflight steps above, so reaching this step
|
||||
# means a real cycle is expected.
|
||||
env:
|
||||
E2E_REQUIRE_LIVE: "1"
|
||||
run: bash tests/e2e/test_staging_external_runtime.sh
|
||||
|
||||
# Mirror the e2e-staging-saas.yml safety net: if the runner is
|
||||
|
||||
@@ -109,6 +109,9 @@ jobs:
|
||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
|
||||
E2E_RUNTIME: claude-code
|
||||
# Platform-managed create path (moonshot/kimi-k2.6, no tenant key) — the
|
||||
# combo proven to create cleanly; this test only needs the ws online.
|
||||
E2E_LLM_PATH: platform
|
||||
E2E_MODEL_SLUG: MiniMax-M2
|
||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
|
||||
@@ -172,9 +172,23 @@ jobs:
|
||||
# and defeats the cost saving. Operators can override via the
|
||||
# workflow_dispatch flow (no input wired here yet — runtime
|
||||
# override is enough for ad-hoc).
|
||||
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2' }}
|
||||
#
|
||||
# #2263 deploy-skew: the claude-code default is the COLON-namespaced BYOK
|
||||
# id `minimax:MiniMax-M2.7`, NOT bare `MiniMax-M2`. The deployed staging
|
||||
# ws-server's compiled registry can lag source; validateRegisteredModelForRuntime
|
||||
# 400s the bare form on an older image (the sibling Platform Boot job, on
|
||||
# the SAME image, succeeds with namespaced `moonshot/kimi-k2.6`). The colon
|
||||
# form stays in the BYOK `minimax` arm (providers.yaml:851) so it resolves
|
||||
# provider=minimax (BYOK) and the #1994 byok-not-platform guard still
|
||||
# passes — the slash/platform form `minimax/MiniMax-M2.7` would not.
|
||||
E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'minimax:MiniMax-M2.7' }}
|
||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
# Fail-closed-on-skip: in CI the harness MUST prove ≥1 full
|
||||
# provision→online→A2A cycle. If it reaches the end having validated
|
||||
# nothing (a future short-circuit / skip path), it exits 5 rather than
|
||||
# reporting a false green. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE.
|
||||
E2E_REQUIRE_LIVE: '1'
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@@ -372,6 +386,10 @@ jobs:
|
||||
E2E_MODE: smoke
|
||||
E2E_RUN_ID: "platform-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
# Fail-closed-on-skip (see BYOK job). smoke mode still runs steps 2/4/7/8b,
|
||||
# so all four required milestones (provisioned/tenant_online/
|
||||
# workspace_online/a2a_roundtrip) fire — the guard is valid for this lane too.
|
||||
E2E_REQUIRE_LIVE: '1'
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -33,11 +33,20 @@
|
||||
# 2026-05-17 (internal#189 Phase 1).
|
||||
#
|
||||
# BURN-IN CLOSED 2026-05-17 (internal#189 Phase 1): The 7-day burn-in
|
||||
# window closed. continue-on-error: true has been removed from the
|
||||
# tier-check job; AND-composition is now fully enforced. If you need
|
||||
# to temporarily re-introduce a mask, file a tracker and follow the
|
||||
# mc#1982 protocol (Tier 2e lint requires a current tracker within
|
||||
# 2 lines of any continue-on-error: true).
|
||||
# window closed. As of 2026-06-04 the residual masks left behind by the
|
||||
# burn-in are removed for real (the comment previously claimed this while
|
||||
# the masks still persisted — that was stale):
|
||||
# - continue-on-error: true on the jq-install step (redundant; the step
|
||||
# already exits 0) and on the tier-check step (the burn-in mask).
|
||||
# - the `|| true` after the sop-tier-check.sh invocation, which masked
|
||||
# real tier-gate verdicts.
|
||||
# AND-composition is now fully enforced and the tier-check step can
|
||||
# honestly red CI on a real SOP-6 violation. SOP_FAIL_OPEN=1 is RETAINED
|
||||
# as sanctioned infra-resilience: it fails-open only on token/network/jq
|
||||
# faults, never on a real gate verdict. If you need to temporarily
|
||||
# re-introduce a mask, file a tracker and follow the mc#1982 protocol
|
||||
# (Tier 2e lint requires a current tracker within 2 lines of any
|
||||
# continue-on-error: true).
|
||||
|
||||
name: sop-tier-check
|
||||
|
||||
@@ -90,10 +99,11 @@ jobs:
|
||||
# GitHub releases may be unreachable from some runner networks
|
||||
# (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
|
||||
# runners). The sop-tier-check script has its own fallback as a
|
||||
# third line of defense. continue-on-error: true ensures this step
|
||||
# failing does not block the job.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# third line of defense, and this step's final command
|
||||
# (`jq --version ... || echo`) already exits 0 unconditionally — so
|
||||
# the step cannot fail the job on its own.
|
||||
# continue-on-error REMOVED 2026-06-04 (mc#1982 directive: root-fix
|
||||
# and remove, do not renew). It was redundant masking, not a gate.
|
||||
run: |
|
||||
# apt-get is the primary method — Ubuntu package mirrors are reliably
|
||||
# reachable from runner containers. GitHub releases may be blocked
|
||||
@@ -110,11 +120,11 @@ jobs:
|
||||
jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"
|
||||
|
||||
- name: Verify tier label + reviewer team membership
|
||||
# continue-on-error: true at step level — job-level is ignored by Gitea
|
||||
# Actions (quirk #10, internal runbooks). Belt-and-suspenders with
|
||||
# SOP_FAIL_OPEN=1 + || true below.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# continue-on-error REMOVED 2026-06-04 (expired internal#189 Phase 1
|
||||
# burn-in, window closed 2026-05-17; mc#1982 directive: root-fix and
|
||||
# remove, do not renew). SOP_FAIL_OPEN=1 below still fails-open on
|
||||
# token/network/infra errors only (never on a real tier-gate verdict),
|
||||
# so this step can now honestly fail CI on a genuine SOP-6 violation.
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
@@ -123,9 +133,13 @@ jobs:
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
SOP_DEBUG: '0'
|
||||
SOP_LEGACY_CHECK: '0'
|
||||
# SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
|
||||
# the actual merge gate. Combined with continue-on-error: true
|
||||
# above, this step never fails the job regardless of script exit.
|
||||
# SOP_FAIL_OPEN=1 fails-open ONLY on infra faults (empty/invalid
|
||||
# token, unreachable Gitea API, missing jq) — see the guarded
|
||||
# `exit 0` branches in sop-tier-check.sh. It does NOT mask a real
|
||||
# tier-gate verdict: a missing tier label, no approving review, or
|
||||
# an unsatisfied AND-clause still `exit 1`. Kept as sanctioned
|
||||
# infra-resilience; the `|| true` mask was REMOVED with the burn-in
|
||||
# COE (2026-06-04) so a genuine SOP-6 violation now reds CI.
|
||||
SOP_FAIL_OPEN: '1'
|
||||
run: |
|
||||
bash .gitea/scripts/sop-tier-check.sh || true
|
||||
bash .gitea/scripts/sop-tier-check.sh
|
||||
|
||||
@@ -101,10 +101,19 @@ test.describe("Desktop ChatTab", () => {
|
||||
await textarea.fill("Trigger activity");
|
||||
await page.getByRole("button", { name: /Send/ }).first().click();
|
||||
|
||||
// Activity log container should appear during the send flow.
|
||||
await expect(page.locator("[data-testid='activity-log']").first()).toBeVisible({ timeout: 10_000 }).catch(() => {
|
||||
// Activity log may not be present in all layouts.
|
||||
});
|
||||
// FALSE-GREEN FIX: the prior `.catch(() => {})` swallowed the assertion
|
||||
// entirely, so this test passed whether or not the activity log ever
|
||||
// rendered. The activity-log container is optional per layout, so we
|
||||
// gate on its presence in the DOM: if it's not part of this layout,
|
||||
// skip explicitly (a recorded skip, not a silent pass); if it IS
|
||||
// present, it MUST become visible during the send flow — that's the
|
||||
// behaviour this test exists to protect.
|
||||
const activityLog = page.locator("[data-testid='activity-log']").first();
|
||||
if ((await activityLog.count()) === 0) {
|
||||
test.skip(true, "activity-log not part of this layout");
|
||||
return;
|
||||
}
|
||||
await expect(activityLog).toBeVisible({ timeout: 10_000 });
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -0,0 +1,329 @@
|
||||
/**
|
||||
* Staging canvas E2E — REAL desktop take-control path (core#2261 "Gap 1").
|
||||
*
|
||||
* This is the live-e2e gate that the existing staging-tabs.spec.ts does NOT
|
||||
* provide. staging-tabs only opens the 13 declared workspace-panel tabs
|
||||
* (TAB_IDS at staging-tabs.spec.ts:24-38 — `display` is NOT among them) and
|
||||
* asserts they render without a "Failed to load" toast. It never acquires
|
||||
* display control, never opens the noVNC WebSocket, and never asserts a
|
||||
* framebuffer frame arrives. The companion unit test
|
||||
* canvas/src/components/tabs/__tests__/DisplayTab.test.tsx mocks the RFB
|
||||
* constructor (vi.mock("@novnc/novnc"), see its lines 8/20-39) so NO real
|
||||
* WebSocket is ever opened there either. Result: a broken take-control path
|
||||
* (acquire → noVNC WS upgrade → ws-proxy → EIC → websockify → x11vnc → Xvfb)
|
||||
* ships GREEN. This spec closes that gap by exercising the REAL wire path
|
||||
* end to end against a live, desktop-capable staging workspace.
|
||||
*
|
||||
* What it asserts (the real path, no mocks):
|
||||
* 1. POST /workspaces/<id>/display/control/acquire returns 200 with a
|
||||
* session_url that carries the signed token in its `#token=` fragment
|
||||
* (mirrors workspace_display_control.go:signedDisplaySessionURL).
|
||||
* 2. Opening the noVNC WebSocket at session_url with the subprotocols
|
||||
* ["binary", "molecule-display-token.<token>"] (exactly what the canvas
|
||||
* sends — DisplayTab.tsx:339) UPGRADES (onopen fires, readyState===OPEN,
|
||||
* no immediate 1006 abnormal close). A 1006 / 403 means the handshake
|
||||
* failed somewhere in the proxy chain.
|
||||
* 3. At least one BINARY framebuffer message arrives on that socket — a
|
||||
* real frame off x11vnc, not just a panel mount. RFB sends a
|
||||
* ProtocolVersion banner ("RFB 003.00x\n") as the first server message,
|
||||
* which proves the upstream VNC server is live behind the EIC tunnel.
|
||||
*
|
||||
* Auth model (important): the WS upgrade is gated by workspace-server
|
||||
* middleware.AdminAuth. A browser WebSocket CANNOT set an Authorization
|
||||
* header, so in production the canvas WS upgrade passes AdminAuth via the
|
||||
* same-origin-canvas path (wsauth_middleware.go:isSameOriginCanvas, which
|
||||
* keys off the Origin header the browser sets automatically on a same-origin
|
||||
* WS upgrade). We therefore open the socket from inside the browser page via
|
||||
* page.evaluate AFTER navigating to the tenant origin — so the browser sends
|
||||
* `Origin: https://<slug>.staging.moleculesai.app`, exactly as production
|
||||
* does. The acquire POST (which CAN carry a header) uses the per-tenant admin
|
||||
* bearer set on the context. This is the faithful production handshake, not a
|
||||
* synthetic one.
|
||||
*
|
||||
* Gate / cost: this test only runs when STAGING_DISPLAY_WORKSPACE_ID points
|
||||
* at a STANDING desktop-capable workspace (compute.display.mode ==
|
||||
* "desktop-control"). We deliberately do NOT provision one in the shared
|
||||
* staging-setup.ts: a desktop AMI boots in ~12-15 min and would tax the
|
||||
* existing tabs harness on every run. Standing that workspace up is a cost
|
||||
* item for the CTO (one always-on desktop EC2 on staging). Until that exists,
|
||||
* the test SKIPS loud. When the env IS present, any failure in
|
||||
* provision/acquire/upgrade is a HARD error — fail-closed, never silently
|
||||
* green (no "flaky" disposition: a 1006 names a broken proxy hop).
|
||||
*/
|
||||
|
||||
import { test, expect } from "@playwright/test";
|
||||
|
||||
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
|
||||
|
||||
// The standing desktop-capable workspace id. Absent => skip loud. This is
|
||||
// the single knob that activates the gate; see file header for the cost note.
|
||||
const DISPLAY_WS_ID = process.env.STAGING_DISPLAY_WORKSPACE_ID;
|
||||
|
||||
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
|
||||
test.skip(
|
||||
!DISPLAY_WS_ID,
|
||||
"STAGING_DISPLAY_WORKSPACE_ID not set — no standing desktop-capable staging " +
|
||||
"workspace to exercise the take-control path. Set it to a workspace whose " +
|
||||
"compute.display.mode == 'desktop-control' to activate this real-e2e gate. " +
|
||||
"(Standing that workspace up is a CTO cost item — one always-on desktop EC2.)",
|
||||
);
|
||||
|
||||
// How long we wait for the WS to upgrade + deliver the first frame. The EIC
|
||||
// tunnel + websockify handshake adds real latency on top of the edge; budget
|
||||
// generously but bounded, so a genuinely-dead path fails LOUD instead of
|
||||
// hanging to the suite timeout.
|
||||
const WS_UPGRADE_TIMEOUT_MS = 30_000;
|
||||
const FIRST_FRAME_TIMEOUT_MS = 30_000;
|
||||
|
||||
test.describe("staging desktop take-control (real noVNC path)", () => {
|
||||
test("acquire → WS upgrades → first framebuffer frame arrives", async ({
|
||||
page,
|
||||
context,
|
||||
}) => {
|
||||
// The standing desktop workspace lives in its OWN standing org (it can't
|
||||
// live in the per-run ephemeral org — that gets torn down each run). When
|
||||
// STAGING_DISPLAY_SLUG is configured, staging-setup.ts resolves that org's
|
||||
// tenant URL / admin token / org id and exports them under STAGING_DISPLAY_*.
|
||||
// Fall back to the ephemeral org's exports only if the display org wasn't
|
||||
// separately configured (e.g. the desktop workspace happens to live in the
|
||||
// run's own tenant — not the expected topology, but supported).
|
||||
const tenantURL =
|
||||
process.env.STAGING_DISPLAY_TENANT_URL || process.env.STAGING_TENANT_URL;
|
||||
const tenantToken =
|
||||
process.env.STAGING_DISPLAY_TENANT_TOKEN || process.env.STAGING_TENANT_TOKEN;
|
||||
const orgID =
|
||||
process.env.STAGING_DISPLAY_ORG_ID || process.env.STAGING_ORG_ID;
|
||||
|
||||
// Fail-closed: when the gate env IS present (we got past the skips above),
|
||||
// the rest of the staging context MUST be wired or this is a hard error,
|
||||
// never a silent pass. Mirrors staging-tabs.spec.ts:53-57.
|
||||
if (!tenantURL || !tenantToken) {
|
||||
throw new Error(
|
||||
"STAGING_DISPLAY_WORKSPACE_ID is set but no tenant URL/token is available " +
|
||||
"for the take-control gate. Set STAGING_DISPLAY_SLUG so staging-setup.ts " +
|
||||
"resolves STAGING_DISPLAY_TENANT_URL / STAGING_DISPLAY_TENANT_TOKEN for the " +
|
||||
"standing desktop org (or ensure the ephemeral STAGING_TENANT_* exports exist).",
|
||||
);
|
||||
}
|
||||
|
||||
const workspaceId = DISPLAY_WS_ID as string;
|
||||
|
||||
// The per-tenant admin bearer satisfies AdminAuth for the acquire POST
|
||||
// (which can carry a header). The WS upgrade below relies on Origin
|
||||
// (same-origin canvas), NOT this header.
|
||||
await context.setExtraHTTPHeaders({
|
||||
Authorization: `Bearer ${tenantToken}`,
|
||||
// X-Molecule-Org-Id is required by workspace-server TenantGuard for
|
||||
// cross-org requests routed through the CP edge; staging-setup exports it.
|
||||
// Harmless (and correct) to send on the same-origin tenant box too.
|
||||
...(orgID ? { "X-Molecule-Org-Id": orgID } : {}),
|
||||
});
|
||||
|
||||
// 0. Sanity: the workspace must actually be display-enabled, else the
|
||||
// whole gate is meaningless. Hit the availability endpoint first so a
|
||||
// mis-pointed STAGING_DISPLAY_WORKSPACE_ID fails with a precise message
|
||||
// instead of an opaque acquire error.
|
||||
const availResp = await page.request.get(
|
||||
`${tenantURL}/workspaces/${workspaceId}/display`,
|
||||
);
|
||||
expect(
|
||||
availResp.status(),
|
||||
`GET /display for ${workspaceId} should be 200`,
|
||||
).toBe(200);
|
||||
const avail = await availResp.json();
|
||||
expect(
|
||||
avail.available,
|
||||
`workspace ${workspaceId} is not display-available (reason=${avail.reason}). ` +
|
||||
"STAGING_DISPLAY_WORKSPACE_ID must point at a workspace with " +
|
||||
"compute.display.mode == 'desktop-control' AND a live instance_id.",
|
||||
).toBe(true);
|
||||
|
||||
// 1. Acquire display control. The handler returns session_url +
|
||||
// expires_at; session_url embeds the signed token in its #token=
|
||||
// fragment (workspace_display_control.go:signedDisplaySessionURL).
|
||||
const acquireResp = await page.request.post(
|
||||
`${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
|
||||
{ data: { controller: "user", ttl_seconds: 300 } },
|
||||
);
|
||||
expect(
|
||||
acquireResp.status(),
|
||||
`acquire should be 200; body: ${await acquireResp.text()}`,
|
||||
).toBe(200);
|
||||
const acquire = await acquireResp.json();
|
||||
expect(acquire.controller, "controller should be 'user'").toBe("user");
|
||||
expect(
|
||||
typeof acquire.session_url,
|
||||
`acquire response missing session_url: ${JSON.stringify(acquire)}`,
|
||||
).toBe("string");
|
||||
|
||||
// The token rides in the URL fragment (#token=...), never as a query
|
||||
// param — confirm the contract the client (DisplayTab.tsx:459-466)
|
||||
// depends on so a server-side change to the URL shape fails HERE.
|
||||
const sessionUrl: string = acquire.session_url;
|
||||
expect(
|
||||
sessionUrl,
|
||||
`session_url should carry the token in a #token= fragment: ${sessionUrl}`,
|
||||
).toContain("#token=");
|
||||
|
||||
// 2. Open the REAL noVNC WebSocket from inside the page, so the browser
|
||||
// sends Origin: <tenant> and the same-origin-canvas AdminAuth path
|
||||
// accepts the upgrade (a browser WS can't set Authorization). We
|
||||
// navigate to the tenant origin first purely to anchor the Origin
|
||||
// header; we don't need the canvas bundle to hydrate.
|
||||
await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
|
||||
|
||||
// Reproduce DisplayTab.tsx:459-466 (displayWebSocketConnection): resolve
|
||||
// session_url against the tenant origin, pull the token out of the
|
||||
// fragment, strip the fragment, switch http(s)->ws(s). Then connect with
|
||||
// the exact subprotocols the canvas uses (DisplayTab.tsx:339).
|
||||
const result = await page.evaluate(
|
||||
async ({ rawSessionUrl, upgradeTimeoutMs, frameTimeoutMs }) => {
|
||||
const u = new URL(rawSessionUrl, window.location.href);
|
||||
const token =
|
||||
new URLSearchParams(u.hash.replace(/^#/, "")).get("token") ?? "";
|
||||
if (!token) {
|
||||
return { ok: false, stage: "token-parse", detail: "no #token in session_url" };
|
||||
}
|
||||
u.hash = "";
|
||||
u.protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
|
||||
const wsUrl = u.toString();
|
||||
|
||||
return await new Promise<{
|
||||
ok: boolean;
|
||||
stage: string;
|
||||
detail: string;
|
||||
frameBytes?: number;
|
||||
frameKind?: string;
|
||||
closeCode?: number;
|
||||
}>((resolve) => {
|
||||
let upgraded = false;
|
||||
let settled = false;
|
||||
const finish = (r: {
|
||||
ok: boolean;
|
||||
stage: string;
|
||||
detail: string;
|
||||
frameBytes?: number;
|
||||
frameKind?: string;
|
||||
closeCode?: number;
|
||||
}) => {
|
||||
if (settled) return;
|
||||
settled = true;
|
||||
try {
|
||||
ws.close();
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
resolve(r);
|
||||
};
|
||||
|
||||
let ws: WebSocket;
|
||||
try {
|
||||
ws = new WebSocket(wsUrl, [`binary`, `molecule-display-token.${token}`]);
|
||||
} catch (e) {
|
||||
resolve({ ok: false, stage: "construct", detail: String(e) });
|
||||
return;
|
||||
}
|
||||
ws.binaryType = "arraybuffer";
|
||||
|
||||
const upgradeTimer = setTimeout(() => {
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "upgrade-timeout",
|
||||
detail: `WS did not open within ${upgradeTimeoutMs}ms (readyState=${ws.readyState})`,
|
||||
});
|
||||
}, upgradeTimeoutMs);
|
||||
|
||||
let frameTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
|
||||
ws.onopen = () => {
|
||||
upgraded = true;
|
||||
clearTimeout(upgradeTimer);
|
||||
// Now wait for the first server message. RFB's ProtocolVersion
|
||||
// banner is the first thing x11vnc sends; if nothing arrives the
|
||||
// tunnel opened but the VNC server behind it is dead.
|
||||
frameTimer = setTimeout(() => {
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "frame-timeout",
|
||||
detail: `WS upgraded but no framebuffer message within ${frameTimeoutMs}ms`,
|
||||
});
|
||||
}, frameTimeoutMs);
|
||||
};
|
||||
|
||||
ws.onmessage = (ev) => {
|
||||
if (frameTimer) clearTimeout(frameTimer);
|
||||
let bytes = 0;
|
||||
let kind: string = typeof ev.data;
|
||||
if (ev.data instanceof ArrayBuffer) {
|
||||
bytes = ev.data.byteLength;
|
||||
kind = "ArrayBuffer";
|
||||
} else if (typeof Blob !== "undefined" && ev.data instanceof Blob) {
|
||||
bytes = ev.data.size;
|
||||
kind = "Blob";
|
||||
} else if (typeof ev.data === "string") {
|
||||
bytes = ev.data.length;
|
||||
kind = "string";
|
||||
}
|
||||
finish({
|
||||
ok: bytes > 0,
|
||||
stage: "frame",
|
||||
detail:
|
||||
bytes > 0
|
||||
? "received framebuffer message"
|
||||
: "first message was empty",
|
||||
frameBytes: bytes,
|
||||
frameKind: kind,
|
||||
});
|
||||
};
|
||||
|
||||
ws.onclose = (ev) => {
|
||||
// A close BEFORE open === failed upgrade (1006 abnormal / 403
|
||||
// forbidden surface here). A close AFTER we already saw a frame is
|
||||
// benign (our own finish() triggered it).
|
||||
if (!upgraded) {
|
||||
clearTimeout(upgradeTimer);
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "upgrade-close",
|
||||
detail: `WS closed before upgrade (code=${ev.code}, reason="${ev.reason}") — handshake rejected somewhere in edge → ws-proxy → EIC → websockify → x11vnc`,
|
||||
closeCode: ev.code,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
ws.onerror = () => {
|
||||
if (!upgraded) {
|
||||
clearTimeout(upgradeTimer);
|
||||
finish({
|
||||
ok: false,
|
||||
stage: "upgrade-error",
|
||||
detail: "WS error before upgrade — proxy chain rejected the handshake",
|
||||
});
|
||||
}
|
||||
};
|
||||
});
|
||||
},
|
||||
{
|
||||
rawSessionUrl: sessionUrl,
|
||||
upgradeTimeoutMs: WS_UPGRADE_TIMEOUT_MS,
|
||||
frameTimeoutMs: FIRST_FRAME_TIMEOUT_MS,
|
||||
},
|
||||
);
|
||||
|
||||
// 3. Assert the real outcome. No "flaky" escape hatch: each failure stage
|
||||
// names the broken hop so a reviewer can act on it directly.
|
||||
expect(
|
||||
result.ok,
|
||||
`take-control failed at stage="${result.stage}": ${result.detail}` +
|
||||
(result.closeCode ? ` (close code ${result.closeCode})` : ""),
|
||||
).toBe(true);
|
||||
expect(
|
||||
result.stage,
|
||||
`expected to reach the 'frame' stage; got '${result.stage}' (${result.detail})`,
|
||||
).toBe("frame");
|
||||
expect(
|
||||
result.frameBytes ?? 0,
|
||||
`framebuffer message should be non-empty (kind=${result.frameKind})`,
|
||||
).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
@@ -337,13 +337,99 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
|
||||
|
||||
// 7. Hand state off to tests + teardown — overwrite the slug-only
|
||||
// bootstrap state with the full state spec tests need.
|
||||
writeFileSync(
|
||||
stateFile,
|
||||
JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2),
|
||||
);
|
||||
//
|
||||
// FAIL-CLOSED handoff: every field the spec reads must be non-empty. If
|
||||
// any is missing here, the spec's env-presence guard would throw with a
|
||||
// generic "did setup run?" message that hides WHICH field was lost. Catch
|
||||
// it at the source — a partial provision must hard-fail setup, never hand
|
||||
// off a half-built state that the spec then has to diagnose (or worse,
|
||||
// skip). This is the loud, fail-closed contract: STAGING was requested,
|
||||
// so an incomplete provision is an error, not a skip.
|
||||
const handoff = { slug, tenantURL, workspaceId, tenantToken };
|
||||
const missingFields = Object.entries(handoff)
|
||||
.filter(([, v]) => !v)
|
||||
.map(([k]) => k);
|
||||
if (missingFields.length > 0) {
|
||||
throw new Error(
|
||||
`[staging-setup] provision incomplete — empty handoff field(s): ` +
|
||||
`${missingFields.join(", ")}. Refusing to hand off a partial state ` +
|
||||
`that would surface downstream as an opaque spec failure.`,
|
||||
);
|
||||
}
|
||||
writeFileSync(stateFile, JSON.stringify(handoff, null, 2));
|
||||
process.env.STAGING_SLUG = slug;
|
||||
process.env.STAGING_TENANT_URL = tenantURL;
|
||||
process.env.STAGING_WORKSPACE_ID = workspaceId;
|
||||
process.env.STAGING_TENANT_TOKEN = tenantToken;
|
||||
// The ephemeral org's UUID — exported so specs that route through the CP
|
||||
// edge can send X-Molecule-Org-Id (workspace-server TenantGuard). The tabs
|
||||
// harness hits the tenant box same-origin and doesn't need it, but the
|
||||
// take-control gate (staging-display.spec.ts) does.
|
||||
process.env.STAGING_ORG_ID = orgID;
|
||||
console.log(`[staging-setup] Ready — ${stateFile}`);
|
||||
|
||||
// 8. (core#2261 Gap 1) Resolve the STANDING desktop-capable org, if one is
|
||||
// configured, for the live take-control e2e (staging-display.spec.ts).
|
||||
//
|
||||
// This block is FULLY env-gated and additive: it provisions NOTHING and is
|
||||
// a no-op unless STAGING_DISPLAY_SLUG is set. We deliberately do NOT spin a
|
||||
// desktop workspace inside this shared setup — a desktop AMI boots in
|
||||
// ~12-15 min and would tax every tabs run. Instead an operator stands up
|
||||
// one always-on desktop org once (a CTO cost item) and points
|
||||
// STAGING_DISPLAY_SLUG + STAGING_DISPLAY_WORKSPACE_ID at it. Here we just
|
||||
// resolve that standing org's tenant URL, admin token, and org id so the
|
||||
// display spec can reach it. Fail-closed: if STAGING_DISPLAY_SLUG is set but
|
||||
// we can't resolve its token/id, we THROW — the gate must never silently
|
||||
// fall back to the (non-desktop) ephemeral org and pass.
|
||||
const displaySlug = process.env.STAGING_DISPLAY_SLUG;
|
||||
if (displaySlug) {
|
||||
console.log(`[staging-setup] Resolving standing desktop org: ${displaySlug}`);
|
||||
|
||||
// org id for the standing slug (admin-orgs row carries it + status).
|
||||
const orgsRes = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
|
||||
if (orgsRes.status !== 200) {
|
||||
throw new Error(
|
||||
`STAGING_DISPLAY_SLUG=${displaySlug} set, but GET /cp/admin/orgs returned ` +
|
||||
`${orgsRes.status} — cannot resolve the standing desktop org for the ` +
|
||||
`take-control gate.`,
|
||||
);
|
||||
}
|
||||
const displayRow = (orgsRes.body?.orgs || []).find(
|
||||
(o: any) => o.slug === displaySlug,
|
||||
);
|
||||
if (!displayRow?.id) {
|
||||
throw new Error(
|
||||
`STAGING_DISPLAY_SLUG=${displaySlug} not found in /cp/admin/orgs — the ` +
|
||||
`standing desktop org for the take-control gate does not exist. Provision ` +
|
||||
`it (one always-on desktop EC2) or unset STAGING_DISPLAY_SLUG/` +
|
||||
`STAGING_DISPLAY_WORKSPACE_ID to skip the gate.`,
|
||||
);
|
||||
}
|
||||
if (displayRow.instance_status !== "running") {
|
||||
throw new Error(
|
||||
`Standing desktop org ${displaySlug} is '${displayRow.instance_status}', ` +
|
||||
`not 'running' — the take-control gate needs a live desktop tenant. ` +
|
||||
`full row: ${JSON.stringify(displayRow)}`,
|
||||
);
|
||||
}
|
||||
|
||||
const displayTokRes = await jsonFetch(
|
||||
`${CP_URL}/cp/admin/orgs/${displaySlug}/admin-token`,
|
||||
{ headers: adminAuth },
|
||||
);
|
||||
if (displayTokRes.status !== 200 || !displayTokRes.body?.admin_token) {
|
||||
throw new Error(
|
||||
`admin-token fetch for standing desktop org ${displaySlug} returned ` +
|
||||
`${displayTokRes.status}: ${JSON.stringify(displayTokRes.body)}`,
|
||||
);
|
||||
}
|
||||
|
||||
process.env.STAGING_DISPLAY_ORG_ID = displayRow.id;
|
||||
process.env.STAGING_DISPLAY_TENANT_URL = `https://${displaySlug}.${TENANT_DOMAIN}`;
|
||||
process.env.STAGING_DISPLAY_TENANT_TOKEN = displayTokRes.body.admin_token;
|
||||
console.log(
|
||||
`[staging-setup] Standing desktop org resolved: ${displaySlug} ` +
|
||||
`(org_id=${displayRow.id}, url=${process.env.STAGING_DISPLAY_TENANT_URL})`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
+305
-33
@@ -1,7 +1,8 @@
|
||||
/**
|
||||
* Staging canvas E2E — opens each of the 13 workspace-panel tabs against a
|
||||
* fresh staging org provisioned in the global setup. Asserts each tab
|
||||
* renders without throwing and captures a screenshot for visual review.
|
||||
* Staging canvas E2E — opens each workspace-panel tab against a fresh
|
||||
* staging org provisioned in the global setup. Asserts each tab renders
|
||||
* REAL content (not an empty container, not an error state) and captures a
|
||||
* screenshot for visual review.
|
||||
*
|
||||
* Auth model: the tenant platform's AdminAuth middleware accepts a bearer
|
||||
* token OR a WorkOS session cookie. Playwright can't mint a WorkOS
|
||||
@@ -10,17 +11,39 @@
|
||||
* Bearer header via context.setExtraHTTPHeaders(). Every browser
|
||||
* request inherits the header.
|
||||
*
|
||||
* Known SaaS gaps — documented in #1369 and allowed to render errored
|
||||
* content without failing the test (the gate is "no hard crash, no
|
||||
* 'Failed to load' toast"):
|
||||
* PROMOTION-READINESS (see § at bottom of file): this suite is being
|
||||
* hardened toward becoming a HARD merge-gate. It currently runs under
|
||||
* `continue-on-error: true` (RFC internal#219 §1, non-gating) — that is a
|
||||
* deliberate, CTO-owned call and is NOT changed here. The hardening makes
|
||||
* every assertion deterministic so that WHEN promotion happens the gate
|
||||
* does not flap. See the PROMOTION-READINESS block at the foot of this
|
||||
* file for what is now reliable and what still blocks promotion.
|
||||
*
|
||||
* Known SaaS gaps — documented in #1369. These tabs legitimately cannot
|
||||
* load real content in SaaS mode and are allowed an in-panel empty/error
|
||||
* state (NOT a hard crash, NOT an ErrorBoundary):
|
||||
* - Files tab: empty (platform can't docker exec into a remote EC2)
|
||||
* - Terminal tab: WS connect fails
|
||||
* - Peers tab: 401 without workspace-scoped token
|
||||
* These are enumerated in KNOWN_DEGRADED_TABS below and asserted with a
|
||||
* weaker (but still non-trivial) contract: the panel renders and does not
|
||||
* crash the app. Every OTHER tab must render real content.
|
||||
*/
|
||||
|
||||
import { test, expect } from "@playwright/test";
|
||||
import { test, expect, type Page } from "@playwright/test";
|
||||
|
||||
// Tab ids as declared in canvas/src/components/SidePanel.tsx TABS.
|
||||
//
|
||||
// NOTE (drift guard): this list is asserted-complete against the live DOM
|
||||
// below (see "tab list parity" step) so it cannot silently drift out of
|
||||
// sync with SidePanel.tsx TABS the way a hand-maintained constant does.
|
||||
// `display` and `container-config` are intentionally EXCLUDED here:
|
||||
// - `display` is owned by the in-flight take-control e2e (PR #2275 /
|
||||
// staging-display.spec.ts); asserting it here would collide.
|
||||
// - `container-config` only renders when selectedNodeId is set AND is
|
||||
// gated on tier; it is covered by container-config-specific specs.
|
||||
// The parity check accounts for these via EXPECTED_EXTRA_TABS so a NEW
|
||||
// tab appearing in SidePanel still trips the guard.
|
||||
const TAB_IDS = [
|
||||
"chat",
|
||||
"activity",
|
||||
@@ -37,12 +60,131 @@ const TAB_IDS = [
|
||||
"audit",
|
||||
] as const;
|
||||
|
||||
// Tabs present in the DOM that this spec intentionally does not drive.
|
||||
// Keeping this explicit means a genuinely-new tab (not one of these) makes
|
||||
// the parity assertion fail LOUD instead of being silently un-tested.
|
||||
const EXPECTED_EXTRA_TABS = ["display", "container-config"] as const;
|
||||
|
||||
// Tabs that are KNOWN to degrade in SaaS mode (#1369). They get the weaker
|
||||
// "renders + no crash" contract instead of the "real content" contract.
|
||||
// Anything NOT in this set must render real content or the test fails.
|
||||
const KNOWN_DEGRADED_TABS = new Set<string>(["terminal", "files"]);
|
||||
|
||||
const STAGING = process.env.CANVAS_E2E_STAGING === "1";
|
||||
|
||||
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
|
||||
// IMPORTANT — fail-closed, not skip-green.
|
||||
//
|
||||
// `test.skip(!STAGING)` is correct ONLY when the operator never asked for a
|
||||
// staging run (CANVAS_E2E_STAGING unset). In that case the workflow's
|
||||
// detect-changes / token-check gates have already decided not to exercise
|
||||
// staging, and skipping is the documented contract.
|
||||
//
|
||||
// But if STAGING *is* requested (CANVAS_E2E_STAGING=1) and global setup did
|
||||
// NOT hand off the tenant state, that is a HARD failure, not a skip — see
|
||||
// the explicit env-presence throw inside the test body. A silent skip there
|
||||
// would let a broken provision ship green, which is exactly the
|
||||
// weak-gate failure this hardening removes (§ No flakes / internal#828).
|
||||
test.skip(!STAGING, "CANVAS_E2E_STAGING not set — staging-only suite, not requested");
|
||||
|
||||
/**
|
||||
* Assert the panel for `tabId` rendered real content.
|
||||
*
|
||||
* Deterministic contract (no fixed waits — every step is condition-based
|
||||
* with Playwright's built-in retry / expect.poll):
|
||||
* 1. The tabpanel container is visible.
|
||||
* 2. The global ErrorBoundary did NOT trip ("Something went wrong").
|
||||
* 3. No visible error alert is shown in the panel.
|
||||
* 4. For non-degraded tabs: the panel settles to non-empty,
|
||||
* non-spinner content (so an empty <div/> or a stuck "Loading…"
|
||||
* spinner FAILS instead of passing as it did before).
|
||||
*/
|
||||
async function assertPanelRendered(page: Page, tabId: string): Promise<void> {
|
||||
const panel = page.locator(`#panel-${tabId}`);
|
||||
|
||||
// (1) Container visible. Built-in retry up to the expect timeout — no
|
||||
// arbitrary waitForTimeout. Mechanism: replaces any reliance on a fixed
|
||||
// settle delay with a real visibility condition.
|
||||
await expect(panel, `panel for ${tabId} never became visible`).toBeVisible({
|
||||
timeout: 10_000,
|
||||
});
|
||||
|
||||
// (2) ErrorBoundary trip = hard crash anywhere in the React subtree.
|
||||
// canvas/src/components/ErrorBoundary.tsx renders "Something went wrong".
|
||||
// The OLD gate only looked for a "Failed to load" toast and would ship
|
||||
// an ErrorBoundary-crashed panel GREEN. Mechanism: assert the crash
|
||||
// surface is absent, retried via expect.poll so a late-mounting crash
|
||||
// banner is still caught.
|
||||
await expect
|
||||
.poll(
|
||||
async () =>
|
||||
page.getByText("Something went wrong", { exact: false }).count(),
|
||||
{
|
||||
message: `tab ${tabId}: ErrorBoundary tripped (Something went wrong)`,
|
||||
timeout: 5_000,
|
||||
},
|
||||
)
|
||||
.toBe(0);
|
||||
|
||||
// (3) No visible error alert inside the panel. Tabs surface load errors
|
||||
// as role="alert" with the real error text (EventsTab/ChannelsTab/
|
||||
// ConfigTab/...). The OLD gate matched ONLY [role=alert]:has-text("Failed
|
||||
// to load") — it missed (a) error messages that don't contain that exact
|
||||
// phrase and (b) error divs that omit role="alert" entirely (e.g.
|
||||
// ActivityTab). We replace it with a broader, but still SaaS-gap-aware,
|
||||
// check: any *visible* alert OR red error banner inside the panel.
|
||||
//
|
||||
// Degraded tabs (#1369) are allowed an error state — for those we only
|
||||
// require no app-level crash (covered by step 2). For every other tab a
|
||||
// visible error alert is a real regression.
|
||||
if (!KNOWN_DEGRADED_TABS.has(tabId)) {
|
||||
const visibleAlerts = panel.locator('[role="alert"]:visible');
|
||||
await expect
|
||||
.poll(async () => visibleAlerts.count(), {
|
||||
message:
|
||||
`tab ${tabId}: a visible error alert is shown in the panel ` +
|
||||
`(was a weak "Failed to load"-only check before)`,
|
||||
timeout: 5_000,
|
||||
})
|
||||
.toBe(0);
|
||||
}
|
||||
|
||||
// (4) Real content. The tabpanel CONTAINER always mounts, so the old
|
||||
// toBeVisible() on the container passed even when the child rendered
|
||||
// nothing. Assert the panel's trimmed innerText is non-empty AND not
|
||||
// stuck on a loading spinner. expect.poll retries until the async
|
||||
// fetch+render settles — replacing the implicit "the network finished
|
||||
// by now" timing assumption with an explicit polled condition.
|
||||
//
|
||||
// Degraded tabs may legitimately be empty (Files in SaaS mode), so they
|
||||
// are exempt from the non-empty requirement; step 2 still guards them
|
||||
// against a hard crash.
|
||||
if (!KNOWN_DEGRADED_TABS.has(tabId)) {
|
||||
await expect
|
||||
.poll(
|
||||
async () => {
|
||||
const text = ((await panel.innerText()) || "").trim();
|
||||
// A panel still showing only a loading spinner has not settled.
|
||||
const stillLoading = /^(loading\b|loading…|loading\.\.\.)/i.test(
|
||||
text,
|
||||
);
|
||||
return text.length > 0 && !stillLoading;
|
||||
},
|
||||
{
|
||||
message:
|
||||
`tab ${tabId}: panel rendered empty or stuck on a loading ` +
|
||||
`spinner — no real content settled (weak "container visible" ` +
|
||||
`gate would have passed this)`,
|
||||
// Generous: real tabs fetch from the tenant over the network.
|
||||
// Polled, so it returns as soon as content appears.
|
||||
timeout: 20_000,
|
||||
},
|
||||
)
|
||||
.toBe(true);
|
||||
}
|
||||
}
|
||||
|
||||
test.describe("staging canvas tabs", () => {
|
||||
test("each workspace-panel tab renders without error", async ({
|
||||
test("each workspace-panel tab renders real content", async ({
|
||||
page,
|
||||
context,
|
||||
}) => {
|
||||
@@ -50,9 +192,16 @@ test.describe("staging canvas tabs", () => {
|
||||
const tenantToken = process.env.STAGING_TENANT_TOKEN;
|
||||
const workspaceId = process.env.STAGING_WORKSPACE_ID;
|
||||
|
||||
// FAIL-CLOSED (not skip): STAGING was requested but global setup did
|
||||
// not export tenant state. A silent skip here would paint a broken
|
||||
// provision GREEN. This is the loud-fail the hardening mandates.
|
||||
if (!tenantURL || !tenantToken || !workspaceId) {
|
||||
throw new Error(
|
||||
"staging-setup.ts did not export STAGING_TENANT_URL / STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID — did global setup run?",
|
||||
"staging-setup.ts did not export STAGING_TENANT_URL / " +
|
||||
"STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID. CANVAS_E2E_STAGING=1 " +
|
||||
"was set (staging WAS requested) but global setup produced no " +
|
||||
"tenant — this is a provisioning failure, NOT a reason to skip. " +
|
||||
"Check the [staging-setup] log above for the real error.",
|
||||
);
|
||||
}
|
||||
|
||||
@@ -152,11 +301,19 @@ test.describe("staging canvas tabs", () => {
|
||||
// omit the URL, so we'd otherwise be flying blind. Logged to the
|
||||
// test's stdout (visible in the workflow log under the failed step).
|
||||
page.on("requestfailed", (req) => {
|
||||
console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
|
||||
console.log(
|
||||
`[e2e/requestfailed] ${req.method()} ${req.url()}: ${
|
||||
req.failure()?.errorText ?? "?"
|
||||
}`,
|
||||
);
|
||||
});
|
||||
page.on("response", (res) => {
|
||||
if (res.status() >= 400) {
|
||||
console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
|
||||
console.log(
|
||||
`[e2e/response-${res.status()}] ${res
|
||||
.request()
|
||||
.method()} ${res.url()}`,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -173,9 +330,8 @@ test.describe("staging canvas tabs", () => {
|
||||
// hydrated, even with zero workspaces) or the hydration-error
|
||||
// banner — whichever wins first. Previous version of this wait
|
||||
// used `[role="tablist"]`, but that selector only appears AFTER
|
||||
// a workspace node is clicked (which happens below at L100), so
|
||||
// the wait would always time out at 45s before any meaningful
|
||||
// failure surfaced.
|
||||
// a workspace node is clicked, so the wait would always time out
|
||||
// at 45s before any meaningful failure surfaced.
|
||||
await page.waitForSelector(
|
||||
'[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
|
||||
{ timeout: 45_000 },
|
||||
@@ -189,10 +345,20 @@ test.describe("staging canvas tabs", () => {
|
||||
"canvas hydration failed — check staging CP + tenant reachability",
|
||||
).toBe(0);
|
||||
|
||||
// The global ErrorBoundary must not have tripped at the app root
|
||||
// either — a crash before the side panel even opens would otherwise
|
||||
// be invisible until a tab assertion happened to notice it.
|
||||
await expect(
|
||||
page.getByText("Something went wrong", { exact: false }),
|
||||
"app-level ErrorBoundary tripped during hydration",
|
||||
).toHaveCount(0);
|
||||
|
||||
// Click the workspace node to open the side panel. Try a data
|
||||
// attribute first, fall back to a generic role-based selector so
|
||||
// the test doesn't break when the node-card markup changes.
|
||||
const byDataAttr = page.locator(`[data-workspace-id="${workspaceId}"]`).first();
|
||||
const byDataAttr = page
|
||||
.locator(`[data-workspace-id="${workspaceId}"]`)
|
||||
.first();
|
||||
if ((await byDataAttr.count()) > 0) {
|
||||
await byDataAttr.click({ timeout: 10_000 });
|
||||
} else {
|
||||
@@ -202,19 +368,56 @@ test.describe("staging canvas tabs", () => {
|
||||
await firstNode.click({ timeout: 10_000 });
|
||||
}
|
||||
|
||||
await page.waitForSelector('[role="tablist"]', { timeout: 15_000 });
|
||||
// The tablist appears once the side panel mounts. Condition-based
|
||||
// wait — no fixed delay.
|
||||
const tablist = page.locator('[role="tablist"]');
|
||||
await expect(
|
||||
tablist,
|
||||
"side panel tablist never appeared after clicking the workspace node",
|
||||
).toBeVisible({ timeout: 15_000 });
|
||||
|
||||
// Tab-list parity guard. The hand-maintained TAB_IDS constant used to
|
||||
// be able to drift silently out of sync with SidePanel.tsx TABS — a
|
||||
// tab could be added to the UI and never get an assertion, shipping
|
||||
// broken-but-untested. Read the actual tab ids from the DOM and assert
|
||||
// every live tab is either driven by this spec (TAB_IDS) or explicitly
|
||||
// excluded (EXPECTED_EXTRA_TABS). A genuinely-new tab fails LOUD.
|
||||
const liveTabIds = (
|
||||
await tablist.locator('[role="tab"][id^="tab-"]').evaluateAll((els) =>
|
||||
els.map((el) => el.id.replace(/^tab-/, "")),
|
||||
)
|
||||
).sort();
|
||||
const accountedFor = new Set<string>([
|
||||
...TAB_IDS,
|
||||
...EXPECTED_EXTRA_TABS,
|
||||
]);
|
||||
const unaccounted = liveTabIds.filter((id) => !accountedFor.has(id));
|
||||
expect(
|
||||
unaccounted,
|
||||
`SidePanel exposes tab(s) this spec neither drives nor excludes: ` +
|
||||
`${unaccounted.join(", ")}. Add them to TAB_IDS (and assert their ` +
|
||||
`content) or to EXPECTED_EXTRA_TABS with a reason.`,
|
||||
).toHaveLength(0);
|
||||
// And the inverse: every TAB_ID we intend to drive must actually exist
|
||||
// in the DOM, so a renamed/removed tab fails here instead of timing out
|
||||
// on a missing #tab-<id> selector with an opaque message.
|
||||
const missing = TAB_IDS.filter((id) => !liveTabIds.includes(id));
|
||||
expect(
|
||||
missing,
|
||||
`TAB_IDS references tab(s) not present in SidePanel: ${missing.join(
|
||||
", ",
|
||||
)} — the spec's tab list has drifted from SidePanel.tsx TABS.`,
|
||||
).toHaveLength(0);
|
||||
|
||||
for (const tabId of TAB_IDS) {
|
||||
await test.step(`tab: ${tabId}`, async () => {
|
||||
const tabButton = page.locator(`#tab-${tabId}`);
|
||||
// The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
|
||||
// wrapper) — tabs after position ~3 are clipped behind the
|
||||
// right-edge fade gradient on smaller viewports. Playwright's
|
||||
// `toBeVisible()` returns false for clipped elements, so a
|
||||
// bare visibility check fails on `skills` and later tabs in
|
||||
// CI. scrollIntoViewIfNeeded brings the button into view
|
||||
// before the visibility check, mirroring what SidePanel's own
|
||||
// keyboard handler does on arrow-key navigation.
|
||||
// The TABS bar is `overflow-x-auto` — tabs past position ~3 are
|
||||
// clipped behind the right-edge fade gradient on smaller
|
||||
// viewports. Playwright's toBeVisible() returns false for clipped
|
||||
// elements, so a bare visibility check fails on later tabs in CI.
|
||||
// scrollIntoViewIfNeeded brings the button into view before the
|
||||
// visibility check.
|
||||
await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
|
||||
await expect(
|
||||
tabButton,
|
||||
@@ -222,18 +425,34 @@ test.describe("staging canvas tabs", () => {
|
||||
).toBeVisible({ timeout: 5_000 });
|
||||
await tabButton.click();
|
||||
|
||||
const panel = page.locator(`#panel-${tabId}`);
|
||||
await expect(panel, `panel for ${tabId} never rendered`).toBeVisible({
|
||||
timeout: 10_000,
|
||||
});
|
||||
// Confirm the click actually activated this tab before asserting
|
||||
// its content — aria-selected flips on the active tab. This closes
|
||||
// a race where a slow click handler left the PREVIOUS tab's panel
|
||||
// mounted and we asserted the wrong panel's content. Built-in
|
||||
// retry, condition-based, no fixed wait.
|
||||
await expect(
|
||||
tabButton,
|
||||
`tab-${tabId} did not become the selected tab after click`,
|
||||
).toHaveAttribute("aria-selected", "true", { timeout: 5_000 });
|
||||
|
||||
// "Failed to load" toast = hard crash. Known SaaS-mode gaps
|
||||
// (Files empty, Terminal disconnected, Peers 401) surface as
|
||||
// in-panel content, not toasts.
|
||||
// Real-content assertion (the core hardening). See
|
||||
// assertPanelRendered: container visible + no ErrorBoundary + no
|
||||
// visible error alert + settled non-empty content for non-degraded
|
||||
// tabs. Replaces the old "panel visible + no Failed-to-load toast"
|
||||
// pair, which shipped empty/errored panels green.
|
||||
await assertPanelRendered(page, tabId);
|
||||
|
||||
// Belt to the braces: the original toast check stays. A global
|
||||
// "Failed to load" toast (role=alert outside the panel) is still a
|
||||
// crash signal worth catching even though the in-panel checks above
|
||||
// now do the heavy lifting.
|
||||
const errorToasts = await page
|
||||
.locator('[role="alert"]:has-text("Failed to load")')
|
||||
.count();
|
||||
expect(errorToasts, `tab ${tabId}: "Failed to load" toast`).toBe(0);
|
||||
expect(
|
||||
errorToasts,
|
||||
`tab ${tabId}: a global "Failed to load" toast is showing`,
|
||||
).toBe(0);
|
||||
|
||||
await page.screenshot({
|
||||
path: `test-results/staging-tab-${tabId}.png`,
|
||||
@@ -267,3 +486,56 @@ test.describe("staging canvas tabs", () => {
|
||||
).toHaveLength(0);
|
||||
});
|
||||
});
|
||||
|
||||
/*
|
||||
* PROMOTION-READINESS — staging canvas E2E → HARD merge-gate
|
||||
* ----------------------------------------------------------
|
||||
* NOW RELIABLE (deterministic; these no longer flap on timing):
|
||||
* - Every wait is condition-based (toBeVisible / toHaveAttribute /
|
||||
* expect.poll). There is NO fixed waitForTimeout / sleep in the spec;
|
||||
* the only setTimeout is the bounded poll-interval inside
|
||||
* staging-setup.ts waitFor(), which has a hard deadline.
|
||||
* - Tabs are asserted on REAL settled content (non-empty, non-spinner),
|
||||
* not just "container is visible" — an empty or stuck-loading panel now
|
||||
* fails instead of shipping green.
|
||||
* - The ErrorBoundary ("Something went wrong") is asserted absent at app
|
||||
* hydration AND per tab — a React subtree crash can no longer pass.
|
||||
* - Visible error alerts inside a panel fail non-degraded tabs (was a
|
||||
* weak [role=alert]:has-text("Failed to load")-only check that missed
|
||||
* both other error phrasings and role-less error divs).
|
||||
* - The driven tab list is parity-checked against the live DOM, so a new
|
||||
* SidePanel tab can't ship un-tested and a removed one fails loud.
|
||||
* - Click→activation is confirmed (aria-selected) before asserting the
|
||||
* panel, removing a wrong-panel race.
|
||||
* - The suite is fail-closed: CANVAS_E2E_STAGING=1 with no tenant state
|
||||
* hard-errors (never skips→green); CANVAS_E2E_STAGING unset cleanly
|
||||
* skips (operator did not request staging).
|
||||
*
|
||||
* STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT flip continue-on-error here —
|
||||
* CTO-owned, RFC internal#219 §1):
|
||||
* - INFRA DEPENDENCY: each run provisions a real staging EC2 tenant
|
||||
* (12-20 min cold boot). Required-gate latency + AWS/Cloudflare/CP
|
||||
* availability become merge-blockers. A staging outage would freeze
|
||||
* main even though the code is fine — unacceptable for a required check
|
||||
* until staging has an SLA or this runs against a warm pre-provisioned
|
||||
* pool.
|
||||
* - SHARED-RESOURCE FLAKE SURFACE: TLS/DNS/ACME propagation on a shared
|
||||
* staging zone (staging-setup TLS_TIMEOUT_MS) is outside this repo's
|
||||
* control. Deterministic here ≠ deterministic upstream.
|
||||
* - SECRET DEPENDENCY: CP_STAGING_ADMIN_API_TOKEN must be present on the
|
||||
* runner. The workflow's skip-if-absent (core#2225) keeps a missing
|
||||
* secret from painting red — correct for non-gating, but a REQUIRED
|
||||
* check must instead guarantee the secret is always present, else it
|
||||
* skip-greens the very thing it is supposed to enforce.
|
||||
* - SINGLE-WORKSPACE COVERAGE: one hermes/platform_managed workspace that
|
||||
* does NOT boot an agent on staging (no CP LLM proxy env, workspace-
|
||||
* server #2162). Tabs render, but agent-dependent content paths (live
|
||||
* chat round-trip, traces from a real run) are not exercised.
|
||||
*
|
||||
* PROMOTION CHECKLIST (when CTO signs off on making this required):
|
||||
* 1. Warm pre-provisioned tenant pool OR a staging SLA bounding boot time.
|
||||
* 2. Guarantee CP_STAGING_ADMIN_API_TOKEN on the gating runner; turn the
|
||||
* skip-if-absent into a hard error for the required path.
|
||||
* 3. Decide whether agent-dependent tabs need a wired LLM proxy on the
|
||||
* staging tenant (covers chat/traces real content) before gating them.
|
||||
*/
|
||||
|
||||
@@ -7,6 +7,14 @@ export default defineConfig({
|
||||
fullyParallel: false,
|
||||
workers: 1,
|
||||
retries: 0,
|
||||
// Fail CLOSED when an explicit spec selection matches zero tests.
|
||||
// Playwright defaults this to true, so `playwright test e2e/chat-*.spec.ts`
|
||||
// would exit 0 (green) if those files were renamed/moved/deleted — a
|
||||
// false-green that would silently gut the e2e-chat gate after a refactor.
|
||||
// forbidOnly likewise stops a stray `test.only` from green-ing the suite
|
||||
// while skipping every other case.
|
||||
passWithNoTests: false,
|
||||
forbidOnly: !!process.env.CI,
|
||||
use: {
|
||||
baseURL: process.env.PLAYWRIGHT_BASE_URL || "http://localhost:3000",
|
||||
headless: true,
|
||||
|
||||
@@ -11,7 +11,10 @@
|
||||
# default + 401, see PR #1714.)
|
||||
#
|
||||
# claude-code → auth-aware:
|
||||
# E2E_MINIMAX_API_KEY → "MiniMax-M2"
|
||||
# E2E_MINIMAX_API_KEY → "minimax:MiniMax-M2.7"
|
||||
# (colon-namespaced BYOK id; bare
|
||||
# "MiniMax-M2" 400s on a deploy-skewed
|
||||
# staging registry — #2263)
|
||||
# E2E_ANTHROPIC_API_KEY → "claude-sonnet-4-6"
|
||||
# otherwise → "sonnet"
|
||||
#
|
||||
@@ -82,7 +85,17 @@ pick_model_slug() {
|
||||
hermes) printf 'openai/gpt-4o' ;;
|
||||
claude-code)
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
printf 'MiniMax-M2'
|
||||
# Namespaced (colon) BYOK id, not bare "MiniMax-M2" (#2263 deploy-skew):
|
||||
# bare ids can lag the deployed staging ws-server's compiled registry,
|
||||
# so workspace-create's validateRegisteredModelForRuntime 400s the bare
|
||||
# form on an older image. The colon-namespaced `minimax:MiniMax-M2.7`
|
||||
# resolves the same way the proven-working sibling `moonshot/kimi-k2.6`
|
||||
# does. It stays in the BYOK `minimax` arm (providers.yaml:851), so
|
||||
# DeriveProvider -> provider_selection=minimax (BYOK) and the #1994
|
||||
# byok-not-platform guard (test_staging_full_saas.sh:1000) still passes —
|
||||
# unlike the slash/platform form `minimax/MiniMax-M2.7`, which resolves
|
||||
# to provider=platform and would trip that guard.
|
||||
printf 'minimax:MiniMax-M2.7'
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
printf 'claude-sonnet-4-6'
|
||||
else
|
||||
|
||||
+332
@@ -0,0 +1,332 @@
|
||||
#!/usr/bin/env bash
|
||||
set -uo pipefail
|
||||
#
|
||||
# test_keyless_feature_contracts_e2e.sh — REQUIRED-lane (E2E API Smoke Test)
|
||||
# keyless HTTP-contract coverage for feature endpoints that ship WITHOUT an
|
||||
# LLM key and had NO e2e assertion before (coverage-audit gap list).
|
||||
#
|
||||
# Why a NEW script (not added to test_api.sh): PR #2286 is concurrently
|
||||
# rewriting test_api.sh's auth helpers + _lib.sh (e2e_admin_auth_args) and the
|
||||
# test_priority_runtimes mock arm. Keeping these assertions in a standalone
|
||||
# file avoids a merge conflict with that in-flight PR and keeps the new feature
|
||||
# coverage independently reviewable. The mock-runtime A2A canned round-trip is
|
||||
# OWNED by #2286's `mock` arm (run_mock) — intentionally NOT duplicated here.
|
||||
#
|
||||
# Every endpoint below is exercised against a runtime=external workspace so NO
|
||||
# LLM key is needed. For each we assert the real HTTP contract: the happy path
|
||||
# AND a meaningful failure mode (401 without auth, 400 on bad input, or the
|
||||
# documented fail-closed status) so the test catches REAL regressions, not
|
||||
# just 200s.
|
||||
#
|
||||
# Auth model (matches workspace-server/internal/middleware/wsauth_middleware.go):
|
||||
# * WorkspaceAuth (/workspaces/:id/*) is STRICT once a token exists — a
|
||||
# bearer-less request 401s (devmode fail-open needs MOLECULE_ENV=dev AND
|
||||
# ADMIN_TOKEN unset, neither of which the e2e-api job sets).
|
||||
# * AdminAuth routes accept the platform ADMIN_TOKEN (post-#2286) OR, when no
|
||||
# ADMIN_TOKEN is configured, any valid workspace bearer (Tier-3 fallback) —
|
||||
# so the workspace token we mint authenticates admin routes in BOTH the
|
||||
# pre-#2286 (no ADMIN_TOKEN) and post-#2286 (ADMIN_TOKEN set) CI shapes.
|
||||
#
|
||||
# Local-run shape (mirrors the e2e-api job — real PG+Redis+platform):
|
||||
# DATABASE_URL=... REDIS_URL=... ADMIN_TOKEN=... ./platform-server &
|
||||
# BASE=http://127.0.0.1:$PORT bash tests/e2e/test_keyless_feature_contracts_e2e.sh
|
||||
|
||||
source "$(dirname "$0")/_lib.sh" # sets BASE default
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
pass() { echo "PASS: $1"; PASS=$((PASS + 1)); }
|
||||
fail() { echo "FAIL: $1"; echo " $2"; FAIL=$((FAIL + 1)); }
|
||||
|
||||
# assert_contains DESC EXPECTED_SUBSTRING ACTUAL
|
||||
assert_contains() {
|
||||
if printf '%s' "$3" | grep -qF "$2"; then
|
||||
pass "$1"
|
||||
else
|
||||
fail "$1" "expected to contain [$2] — got: $3"
|
||||
fi
|
||||
}
|
||||
|
||||
# http_code METHOD URL [curl-args...] → prints the HTTP status code only.
|
||||
http_code() {
|
||||
local method="$1" url="$2"; shift 2
|
||||
curl -s -o /dev/null -w "%{http_code}" -X "$method" "$url" "$@"
|
||||
}
|
||||
|
||||
# body_and_code METHOD URL [curl-args...] → prints "<body>\n<code>".
|
||||
body_and_code() {
|
||||
local method="$1" url="$2"; shift 2
|
||||
curl -s -w $'\n%{http_code}' -X "$method" "$url" "$@"
|
||||
}
|
||||
|
||||
echo "=== Keyless feature HTTP-contract E2E (required lane) ==="
|
||||
echo ""
|
||||
|
||||
# Platform admin bearer when the job set one (#2286 shape). When ADMIN_TOKEN is
|
||||
# configured, AdminAuth's Tier-1 fail-open is OFF even before the first token
|
||||
# exists, so admin-gated create / list / delete must carry it from the start.
|
||||
# Pre-#2286 (no ADMIN_TOKEN) this is empty → fail-open create works bare.
|
||||
ENV_ADMIN="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
|
||||
ENV_ADMIN_AUTH=()
|
||||
[ -n "$ENV_ADMIN" ] && ENV_ADMIN_AUTH=(-H "Authorization: Bearer $ENV_ADMIN")
|
||||
|
||||
# Reproducible counts across reruns. e2e_cleanup_all_workspaces hits the
|
||||
# admin-gated list/delete; the platform admin bearer (if set) goes via the
|
||||
# MOLECULE_ADMIN_TOKEN/ADMIN_TOKEN env the helper already reads.
|
||||
e2e_cleanup_all_workspaces
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixture: one external workspace, registered → online. Keyless (external=true
|
||||
# means no container is provisioned and no LLM key is consulted).
|
||||
# ---------------------------------------------------------------------------
|
||||
R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
|
||||
${ENV_ADMIN_AUTH[@]+"${ENV_ADMIN_AUTH[@]}"} \
|
||||
-d '{"name":"Keyless Fixture","tier":1,"runtime":"external","external":true}')
|
||||
WS_ID=$(printf '%s' "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
|
||||
if [ -z "$WS_ID" ]; then
|
||||
echo "FATAL: could not create fixture workspace — got: $R" >&2
|
||||
exit 2
|
||||
fi
|
||||
assert_contains "POST /workspaces (external fixture created)" '"status":"awaiting_agent"' "$R"
|
||||
|
||||
# Workspace token: register returns one; else mint via the admin endpoint.
|
||||
WS_TOKEN=$(printf '%s' "$R" | e2e_extract_token)
|
||||
if [ -z "$WS_TOKEN" ]; then
|
||||
WS_TOKEN=$(e2e_mint_workspace_token "$WS_ID" 2>/dev/null || echo "")
|
||||
fi
|
||||
if [ -z "$WS_TOKEN" ]; then
|
||||
echo "FATAL: could not obtain workspace token for $WS_ID" >&2
|
||||
exit 2
|
||||
fi
|
||||
AUTH=(-H "Authorization: Bearer $WS_TOKEN")
|
||||
|
||||
# Admin bearer: explicit platform ADMIN_TOKEN if the job set one (#2286 shape),
|
||||
# else the workspace token (AdminAuth Tier-3 accepts it pre-#2286).
|
||||
ADMIN_BEARER="${ENV_ADMIN:-$WS_TOKEN}"
|
||||
ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
|
||||
|
||||
# Bring the fixture online so lifecycle (hibernate) has a hibernatable state.
|
||||
curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
|
||||
|
||||
# ===========================================================================
|
||||
# 1. Terminal diagnose — GET /workspaces/:id/terminal/diagnose (wsAuth)
|
||||
# External workspace has no instance_id → diagnoseLocal path → 200 with a
|
||||
# deterministic report (ok=false, first_failure on docker/container). The
|
||||
# /terminal endpoint itself is a WebSocket upgrade (not HTTP-assertable
|
||||
# keyless); diagnose is its pure-HTTP sibling and the real contract surface.
|
||||
# ===========================================================================
|
||||
echo "--- /terminal/diagnose ---"
|
||||
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose" "${AUTH[@]}")
|
||||
DIAG_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
DIAG_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
assert_contains "GET /terminal/diagnose (200 report)" "200" "$DIAG_CODE"
|
||||
assert_contains "GET /terminal/diagnose (carries workspace_id)" "\"workspace_id\":\"$WS_ID\"" "$DIAG_BODY"
|
||||
assert_contains "GET /terminal/diagnose (has steps[])" '"steps"' "$DIAG_BODY"
|
||||
# Failure mode: no bearer → 401 (WorkspaceAuth strict once a token exists).
|
||||
assert_contains "GET /terminal/diagnose (no auth → 401)" "401" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose")"
|
||||
|
||||
# ===========================================================================
|
||||
# 2. Webhooks (public) — POST /webhooks/:type
|
||||
# Public, no auth. telegram adapter: empty update body → (nil,nil) → 200
|
||||
# ignored; non-JSON → parse error → 400; unknown type → 404.
|
||||
# ===========================================================================
|
||||
echo "--- /webhooks/:type ---"
|
||||
BC=$(body_and_code POST "$BASE/webhooks/telegram" -H "Content-Type: application/json" -d '{}')
|
||||
WH_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
WH_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
assert_contains "POST /webhooks/telegram (non-message update → 200)" "200" "$WH_CODE"
|
||||
assert_contains "POST /webhooks/telegram (status ignored)" '"status":"ignored"' "$WH_BODY"
|
||||
assert_contains "POST /webhooks/telegram (bad JSON → 400)" "400" \
|
||||
"$(http_code POST "$BASE/webhooks/telegram" -H 'Content-Type: application/json' -d 'not-json')"
|
||||
assert_contains "POST /webhooks/<unknown> (→ 404)" "404" \
|
||||
"$(http_code POST "$BASE/webhooks/nope-not-a-channel" -H 'Content-Type: application/json' -d '{}')"
|
||||
|
||||
# ===========================================================================
|
||||
# 3. Budget — GET /workspaces/:id/budget (wsAuth) + PATCH (admin)
|
||||
# GET: fresh workspace → multi-period view, no limits, zero spend.
|
||||
# PATCH: set monthly limit (admin) → reflected; bad input → 400.
|
||||
# ===========================================================================
|
||||
echo "--- /budget ---"
|
||||
BUD=$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")
|
||||
assert_contains "GET /budget (has periods map)" '"periods"' "$BUD"
|
||||
assert_contains "GET /budget (monthly_spend 0 on fresh ws)" '"monthly_spend":0' "$BUD"
|
||||
# PATCH is admin-gated (router.go:419). Set a monthly limit and verify echo.
|
||||
PB=$(curl -s -X PATCH "$BASE/workspaces/$WS_ID/budget" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
|
||||
-d '{"budget_limits":{"monthly":2000}}')
|
||||
assert_contains "PATCH /budget (monthly limit set → echoed)" '"budget_limit":2000' "$PB"
|
||||
# Re-read confirms persistence.
|
||||
assert_contains "GET /budget (limit persisted)" '"budget_limit":2000' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")"
|
||||
# Failure: empty body → 400 "budget_limits or budget_limit field is required".
|
||||
assert_contains "PATCH /budget (empty body → 400)" "400" \
|
||||
"$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
|
||||
# Failure: unknown period → 400.
|
||||
assert_contains "PATCH /budget (unknown period → 400)" "400" \
|
||||
"$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"budget_limits":{"yearly":1}}')"
|
||||
# Failure: GET without bearer → 401.
|
||||
assert_contains "GET /budget (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/budget")"
|
||||
|
||||
# ===========================================================================
|
||||
# 4. Checkpoints — POST/GET/DELETE /workspaces/:id/checkpoints* (wsAuth)
|
||||
# Fully self-contained CRUD over workflow_checkpoints (#788). Upsert → latest
|
||||
# → list-by-wfid → delete → 404. Failure modes: missing workflow_id → 400,
|
||||
# empty latest → 404.
|
||||
# ===========================================================================
|
||||
echo "--- /checkpoints ---"
|
||||
WFID="kl-wf-$$"
|
||||
CP=$(curl -s -X POST "$BASE/workspaces/$WS_ID/checkpoints" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d "{\"workflow_id\":\"$WFID\",\"step_name\":\"step-a\",\"step_index\":1,\"payload\":{\"k\":\"v\"}}")
|
||||
assert_contains "POST /checkpoints (upsert → id + workflow_id)" "\"workflow_id\":\"$WFID\"" "$CP"
|
||||
assert_contains "GET /checkpoints/latest (200 newest)" "\"workflow_id\":\"$WFID\"" \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/latest" "${AUTH[@]}")"
|
||||
assert_contains "GET /checkpoints/:wfid (lists the step)" '"step_name":"step-a"' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
|
||||
DEL=$(curl -s -X DELETE "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")
|
||||
assert_contains "DELETE /checkpoints/:wfid (deleted count)" '"deleted":1' "$DEL"
|
||||
assert_contains "GET /checkpoints/:wfid (after delete → 404)" "404" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
|
||||
# Failure: missing workflow_id → 400 (binding:required).
|
||||
assert_contains "POST /checkpoints (missing workflow_id → 400)" "400" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' "${AUTH[@]}" -d '{"step_name":"x"}')"
|
||||
# Failure: no bearer → 401.
|
||||
assert_contains "POST /checkpoints (no auth → 401)" "401" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' -d '{"workflow_id":"x","step_name":"y"}')"
|
||||
|
||||
# ===========================================================================
|
||||
# 5. Audit — GET /workspaces/:id/audit (wsAuth)
|
||||
# EU AI Act ledger query (#594). Fresh ws → empty events, total 0,
|
||||
# chain_valid null (AUDIT_LEDGER_SALT unset). Failure: bad RFC3339 from → 400.
|
||||
# ===========================================================================
|
||||
echo "--- /audit ---"
|
||||
AUD=$(curl -s "$BASE/workspaces/$WS_ID/audit" "${AUTH[@]}")
|
||||
assert_contains "GET /audit (total 0 on fresh ws)" '"total":0' "$AUD"
|
||||
assert_contains "GET /audit (chain_valid null without salt)" '"chain_valid":null' "$AUD"
|
||||
assert_contains "GET /audit (bad 'from' → 400)" "400" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/audit?from=not-a-date" "${AUTH[@]}")"
|
||||
assert_contains "GET /audit (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/audit")"
|
||||
|
||||
# ===========================================================================
|
||||
# 6. Traces — GET /workspaces/:id/traces (wsAuth)
|
||||
# Langfuse proxy (#590). No LANGFUSE_* configured → 200 [] (graceful empty),
|
||||
# never a 5xx. Failure: no auth → 401.
|
||||
# ===========================================================================
|
||||
echo "--- /traces ---"
|
||||
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/traces" "${AUTH[@]}")
|
||||
TR_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
TR_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
assert_contains "GET /traces (200 without Langfuse)" "200" "$TR_CODE"
|
||||
assert_contains "GET /traces (empty list)" '[]' "$TR_BODY"
|
||||
assert_contains "GET /traces (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/traces")"
|
||||
|
||||
# ===========================================================================
|
||||
# 7. Session search — GET /workspaces/:id/session-search (wsAuth)
|
||||
# Searches activity_logs. Seed one activity row, then assert q-filter finds
|
||||
# it and a non-matching q returns []. Failure: no auth → 401.
|
||||
# ===========================================================================
|
||||
echo "--- /session-search ---"
|
||||
curl -s -X POST "$BASE/workspaces/$WS_ID/activity" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d '{"activity_type":"agent_log","method":"inference","summary":"keyless-needle marker"}' >/dev/null
|
||||
assert_contains "GET /session-search?q=keyless-needle (finds row)" 'keyless-needle' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=keyless-needle" "${AUTH[@]}")"
|
||||
assert_contains "GET /session-search?q=<no-match> (empty)" '[]' \
|
||||
"$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=zzz-no-such-token-zzz" "${AUTH[@]}")"
|
||||
assert_contains "GET /session-search (no auth → 401)" "401" \
|
||||
"$(http_code GET "$BASE/workspaces/$WS_ID/session-search?q=x")"
|
||||
|
||||
# ===========================================================================
|
||||
# 8. Rescue — GET /workspaces/:id/rescue (wsAuth)
|
||||
# RFC internal#742. Fail-CLOSED contract: the e2e-api job has no
|
||||
# MOLECULE_ORG_ID, so the handler returns 503 platform_misconfigured rather
|
||||
# than leaking cross-org. That fail-closed behaviour IS the keyless contract
|
||||
# we gate here (a regression that drops the org guard would flip this to a
|
||||
# 200/404 and turn this assertion RED). Failure mode: no auth → 401.
|
||||
# ===========================================================================
|
||||
echo "--- /rescue ---"
|
||||
BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/rescue" "${AUTH[@]}")
|
||||
RES_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
RES_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
if [ "$RES_CODE" = "404" ]; then
|
||||
# MOLECULE_ORG_ID was set in this environment → no-bundle path.
|
||||
assert_contains "GET /rescue (no bundle → 404, org configured)" 'no rescue bundle' "$RES_BODY"
|
||||
else
|
||||
# No MOLECULE_ORG_ID (the e2e-api default) → fail-closed 503.
|
||||
assert_contains "GET /rescue (fail-closed 503 without MOLECULE_ORG_ID)" "503" "$RES_CODE"
|
||||
assert_contains "GET /rescue (platform_misconfigured code)" 'platform_misconfigured' "$RES_BODY"
|
||||
fi
|
||||
assert_contains "GET /rescue (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/rescue")"
|
||||
|
||||
# ===========================================================================
|
||||
# 9. LLM billing-mode admin toggle — GET/PUT /admin/workspaces/:id/llm-billing-mode
|
||||
# (AdminAuth). Flip to byok → read back override; bad UUID → 400; missing
|
||||
# 'mode' key → 400; unknown mode → 400.
|
||||
# ===========================================================================
|
||||
echo "--- /admin/workspaces/:id/llm-billing-mode ---"
|
||||
assert_contains "GET llm-billing-mode (resolves a mode)" '"resolved_mode"' \
|
||||
"$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
|
||||
PUTBM=$(curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
|
||||
-d '{"mode":"byok"}')
|
||||
assert_contains "PUT llm-billing-mode byok (override set)" '"workspace_override":"byok"' "$PUTBM"
|
||||
assert_contains "GET llm-billing-mode (byok persisted)" '"workspace_override":"byok"' \
|
||||
"$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
|
||||
# Clear the override (null) so we don't leave fixture state skewed.
|
||||
curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
|
||||
-d '{"mode":null}' >/dev/null
|
||||
# Failure: malformed UUID → 400.
|
||||
assert_contains "PUT llm-billing-mode (bad UUID → 400)" "400" \
|
||||
"$(http_code PUT "$BASE/admin/workspaces/not-a-uuid/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"byok"}')"
|
||||
# Failure: missing 'mode' key → 400.
|
||||
assert_contains "PUT llm-billing-mode (missing mode → 400)" "400" \
|
||||
"$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
|
||||
# Failure: unknown mode string → 400.
|
||||
assert_contains "PUT llm-billing-mode (unknown mode → 400)" "400" \
|
||||
"$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"bogus-mode"}')"
|
||||
|
||||
# ===========================================================================
|
||||
# 10. Lifecycle — Pause → Resume + Hibernate (wsAuth)
|
||||
# Pause works backend-agnostically (StopWorkspaceAuto no-ops on no backend)
|
||||
# → status=paused. Resume re-provisions: 200 provisioning when a provisioner
|
||||
# is wired (the e2e-api host has Docker), or 503 provisioner-not-available
|
||||
# otherwise — both are valid contracts, so accept either. Failure modes:
|
||||
# resume a non-paused ws → 404; hibernate a non-online ws → 404.
|
||||
# ===========================================================================
|
||||
echo "--- lifecycle (resume / hibernate) ---"
|
||||
# Pause the (online) fixture → status paused.
|
||||
PA=$(curl -s -X POST "$BASE/workspaces/$WS_ID/pause" "${AUTH[@]}")
|
||||
assert_contains "POST /pause (online → paused)" '"status":"paused"' "$PA"
|
||||
# Resume the paused fixture — accept 200 provisioning OR 503 (no provisioner).
|
||||
BC=$(body_and_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")
|
||||
RSM_CODE=$(printf '%s' "$BC" | tail -n1)
|
||||
RSM_BODY=$(printf '%s' "$BC" | sed '$d')
|
||||
if [ "$RSM_CODE" = "200" ]; then
|
||||
assert_contains "POST /resume (paused → provisioning)" '"status":"provisioning"' "$RSM_BODY"
|
||||
elif [ "$RSM_CODE" = "503" ]; then
|
||||
assert_contains "POST /resume (no provisioner → 503 contract)" 'provisioner not available' "$RSM_BODY"
|
||||
else
|
||||
fail "POST /resume (expected 200 or 503)" "got HTTP $RSM_CODE — $RSM_BODY"
|
||||
fi
|
||||
# Failure: resume a workspace that is NOT paused → 404.
|
||||
# (After the resume above it is provisioning/online, not paused.)
|
||||
assert_contains "POST /resume (not-paused → 404)" "404" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")"
|
||||
# Hibernate: bring the fixture back online first, then hibernate it.
|
||||
curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
|
||||
-d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
|
||||
HB=$(curl -s -X POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")
|
||||
assert_contains "POST /hibernate (online → hibernated)" '"status":"hibernated"' "$HB"
|
||||
# Failure: hibernate again (now hibernated, not online/degraded) → 404.
|
||||
assert_contains "POST /hibernate (not-hibernatable → 404)" "404" \
|
||||
"$(http_code POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")"
|
||||
# Failure: no bearer → 401.
|
||||
assert_contains "POST /resume (no auth → 401)" "401" "$(http_code POST "$BASE/workspaces/$WS_ID/resume")"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cleanup — delete the fixture (admin-gated DELETE + per-workspace bearer).
|
||||
# ---------------------------------------------------------------------------
|
||||
e2e_delete_workspace "$WS_ID" "Keyless Fixture" "${ADMIN_AUTH[@]}"
|
||||
|
||||
echo ""
|
||||
echo "=== Results: $PASS passed, $FAIL failed ==="
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@@ -49,13 +49,13 @@ run_test "codex → slash-form fallback" codex
|
||||
run_test "claude-code → OAuth/default alias" claude-code "sonnet"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code)
|
||||
assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "MiniMax-M2"
|
||||
assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "minimax:MiniMax-M2.7"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code)
|
||||
assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6"
|
||||
|
||||
got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code)
|
||||
assert_eq "claude-code + both keys → MiniMax priority" "$got" "MiniMax-M2"
|
||||
assert_eq "claude-code + both keys → MiniMax priority" "$got" "minimax:MiniMax-M2.7"
|
||||
|
||||
# ── Fallback for unknown runtime ──
|
||||
# Picks slash-form (hermes-shaped) since hermes is the historical
|
||||
|
||||
@@ -300,7 +300,14 @@ rows = json.load(sys.stdin)
|
||||
def text_of(r):
|
||||
body = r.get('request_body') or {}
|
||||
parts = (body.get('params') or {}).get('message', {}).get('parts') or []
|
||||
return ''.join(p.get('text','') for p in parts if p.get('type')=='text')
|
||||
# A2A v0.3 keys the Part discriminator on 'kind'; legacy senders used
|
||||
# 'type'. ProxyA2A.normalizeA2APayload (#2251) rewrites 'type' -> 'kind'
|
||||
# on ingest, so the stored request_body carries 'kind' even when the
|
||||
# caller posted 'type'. Accept EITHER so this parser asserts on the text
|
||||
# payload, not on which discriminator field the server happened to store.
|
||||
def is_text(p):
|
||||
return p.get('kind') == 'text' or p.get('type') == 'text'
|
||||
return ''.join(p.get('text', '') for p in parts if is_text(p))
|
||||
if len(rows) < 2:
|
||||
print('NEED2_GOT_'+str(len(rows)))
|
||||
else:
|
||||
@@ -309,6 +316,29 @@ else:
|
||||
check_eq "since_id feed orders ASC (oldest-new first, newest-new last)" \
|
||||
"hello-from-e2e-2|hello-from-e2e-3" "$ASC_FIRST"
|
||||
|
||||
# Wire-contract gate (#2251): the caller posted parts with the LEGACY "type"
|
||||
# discriminator, but ProxyA2A.normalizeA2APayload rewrites "type" -> "kind"
|
||||
# (A2A v0.3) BEFORE the row is durably logged. Assert the stored request_body
|
||||
# carries "kind" and no longer carries "type", so a regression that drops the
|
||||
# rename — or a feed that stops storing the normalized body — fails loudly here
|
||||
# instead of silently feeding the polling agent an untagged Part. This is the
|
||||
# end-to-end half of the Go unit tests in a2a_proxy_test.go (which assert the
|
||||
# rename in isolation); this proves it survives the durable activity_logs path.
|
||||
DISC=$(echo "$ASC_RESP" | python3 -c "
|
||||
import json, sys
|
||||
rows = json.load(sys.stdin)
|
||||
kinds, types = [], []
|
||||
for r in rows:
|
||||
body = r.get('request_body') or {}
|
||||
parts = (body.get('params') or {}).get('message', {}).get('parts') or []
|
||||
for p in parts:
|
||||
if 'kind' in p: kinds.append(p['kind'])
|
||||
if 'type' in p: types.append(p['type'])
|
||||
print(('kind' if kinds and not types else 'BAD') + ':' + ','.join(kinds) + '/' + ','.join(types))
|
||||
")
|
||||
check_eq "stored Part uses v0.3 'kind' discriminator, never legacy 'type' (#2251)" \
|
||||
"kind:text,text/" "$DISC"
|
||||
|
||||
# ---------- Phase 6: stale cursor returns 410 ----------
|
||||
echo ""
|
||||
echo "--- Phase 6: Stale / unknown cursor returns 410 ---"
|
||||
|
||||
@@ -50,7 +50,11 @@
|
||||
# Optional env (mirrors the full-saas harness where they overlap):
|
||||
# E2E_RUNTIME claude-code (default)
|
||||
# E2E_PROVISION_TIMEOUT_SECS default 900 (cold EC2 budget)
|
||||
# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 3600 (cold-boot worst-case)
|
||||
# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 900 (15min). A workspace that
|
||||
# cannot reach online in 15min is a staging/boot problem,
|
||||
# not slow cold-boot — fail fast so the trap tears down the
|
||||
# EC2 instead of hanging ~1h and leaking a running instance
|
||||
# (observed: run 216031 hung 32min with a live e2e-rec EC2).
|
||||
# E2E_RECONCILE_OFFLINE_TIMEOUT_SECS default 180 (PRIMARY: leave 'online'.
|
||||
# Reconciler cadence is 60s — 3 cycles +
|
||||
# AWS terminate-visibility slack.)
|
||||
@@ -82,7 +86,7 @@ CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
|
||||
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
|
||||
RUNTIME="${E2E_RUNTIME:-claude-code}"
|
||||
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
|
||||
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}"
|
||||
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-900}"
|
||||
# PRIMARY bound: the reconciler ticks every 60s; it needs one cycle to see
|
||||
# the dead instance after AWS makes the terminate visible to DescribeInstances
|
||||
# (typically seconds, but can lag). 180s = ~3 cycles + slack.
|
||||
@@ -325,7 +329,18 @@ ws_field() {
|
||||
# tolerable — but wiring the same keys keeps boot behaviour identical to the
|
||||
# sibling and avoids a config path that only this test would exercise.
|
||||
SECRETS_JSON='{}'
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
# Platform-managed path (E2E_LLM_PATH=platform, the DEFAULT for this test):
|
||||
# the workspace boots on the CP LLM proxy with NO tenant key, model
|
||||
# moonshot/kimi-k2.6 — the exact create combo test_staging_full_saas.sh uses
|
||||
# successfully. This test only needs the workspace to reach status=online so
|
||||
# it can kill the EC2 and assert the reconciler heals it; it does NOT exercise
|
||||
# a real LLM completion, so the platform path is both sufficient and the one
|
||||
# proven to create cleanly. (The BYOK key paths below 400'd at create — see
|
||||
# the create-failure capture added below — which is why platform is default.)
|
||||
if [ "${E2E_LLM_PATH:-platform}" = "platform" ]; then
|
||||
log " LLM path: PLATFORM-MANAGED (no tenant key; moonshot/kimi-k2.6 via proxy)"
|
||||
SECRETS_JSON='{}'
|
||||
elif [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))")
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'ANTHROPIC_API_KEY': os.environ['E2E_ANTHROPIC_API_KEY']}))")
|
||||
@@ -345,21 +360,32 @@ print(json.dumps({
|
||||
")
|
||||
fi
|
||||
|
||||
MODEL_SLUG=$(pick_model_slug "$RUNTIME")
|
||||
E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" MODEL_SLUG=$(E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" pick_model_slug "$RUNTIME")
|
||||
log " MODEL_SLUG=$MODEL_SLUG"
|
||||
|
||||
log "4/6 Provisioning workspace (runtime=$RUNTIME)..."
|
||||
# --fail-with-body makes curl exit non-zero on a 4xx/5xx but STILL writes the
|
||||
# response body to stdout; the `|| { ... }` catches that so the body is printed
|
||||
# instead of `set -e` aborting the command-substitution silently (the old bug
|
||||
# that hid the real HTTP-400 reason). $WS_RESP holds the body either way.
|
||||
WS_RESP=$(tenant_call POST /workspaces \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}")
|
||||
WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['id'])")
|
||||
[ -z "$WS_ID" ] && fail "Workspace create response missing 'id': $WS_RESP"
|
||||
-d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}") || {
|
||||
rc=$?
|
||||
fail "Workspace create failed (curl rc=$rc, model=$MODEL_SLUG). Response body: $WS_RESP"
|
||||
}
|
||||
WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
|
||||
[ -z "$WS_ID" ] && fail "Workspace create response missing 'id' (model=$MODEL_SLUG): $WS_RESP"
|
||||
log " WS_ID=$WS_ID"
|
||||
|
||||
# Wait for the workspace to reach status=online and capture its instance_id.
|
||||
log " Waiting for workspace to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min)..."
|
||||
ONLINE_DEADLINE=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS ))
|
||||
ORIGINAL_INSTANCE_ID=""
|
||||
ONLINE_SINCE=""
|
||||
# Grace before falling back to the AWS workspace tag when the tenant API
|
||||
# does not surface instance_id (observed on staging).
|
||||
INSTANCE_ID_GRACE_SECS="${E2E_INSTANCE_ID_GRACE_SECS:-45}"
|
||||
WS_LAST_STATUS=""
|
||||
while true; do
|
||||
if [ "$(date +%s)" -gt "$ONLINE_DEADLINE" ]; then
|
||||
@@ -372,11 +398,27 @@ while true; do
|
||||
WS_LAST_STATUS="$WS_STATUS"
|
||||
fi
|
||||
if [ "$WS_STATUS" = "online" ]; then
|
||||
[ -z "$ONLINE_SINCE" ] && ONLINE_SINCE=$(date +%s)
|
||||
ORIGINAL_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
|
||||
if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
|
||||
break
|
||||
fi
|
||||
# online but instance_id not surfaced yet — keep polling briefly.
|
||||
# The workspace is online but the tenant API does not surface instance_id
|
||||
# (observed on staging — the DB has it, the API response omits it). After a
|
||||
# short grace, fall back to the AWS workspace-instance tag so the kill step
|
||||
# can proceed. The reconciler reads instance_id from the DB and acts on the
|
||||
# real EC2 regardless of what the API surfaces, so the AWS-tag instance is
|
||||
# the correct kill target. Without this fallback the loop spins to the online
|
||||
# deadline and fails with a misleading "never reached online".
|
||||
if [ $(( $(date +%s) - ONLINE_SINCE )) -ge "$INSTANCE_ID_GRACE_SECS" ]; then
|
||||
# ws-tenant-<slug>-<wsid...> is the workspace EC2 (vs tenant-<slug>).
|
||||
ORIGINAL_INSTANCE_ID=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null \
|
||||
| awk '$2 ~ /^ws-tenant-/ {print $1}' | sort -u | head -1)
|
||||
if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
|
||||
log " instance_id not surfaced by API after ${INSTANCE_ID_GRACE_SECS}s — using AWS workspace tag: $ORIGINAL_INSTANCE_ID"
|
||||
break
|
||||
fi
|
||||
fi
|
||||
log " $WS_ID online but instance_id not populated yet — waiting"
|
||||
fi
|
||||
# 'failed' is transient on cold boot (bootstrap-watcher deadline vs heartbeat
|
||||
|
||||
Executable
+124
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env bash
|
||||
# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE
|
||||
# fail-closed-on-skip guard in test_staging_full_saas.sh.
|
||||
#
|
||||
# WHY (harden/e2e-staging-saas-failclosed): the staging SaaS E2E is being
|
||||
# hardened to become a HARD merge-gate. A gate that can reach its final `ok`
|
||||
# WITHOUT having actually exercised a provision→online→A2A cycle is a
|
||||
# false-green — it would let a refactor that short-circuits the lifecycle
|
||||
# (or a skip path that swallows it) report PASS. require_live_or_die() is the
|
||||
# guard; this test proves it FAILS (exit 5) when milestones are missing and
|
||||
# PASSES when all fired — the watch-it-fail counterpart the dev-SOP requires.
|
||||
#
|
||||
# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
|
||||
# logic — so it can run on every PR in the fast lane and locally via `bash`.
|
||||
set -uo pipefail
|
||||
|
||||
# Scratch dir for the generated guard-runner stubs. EXIT trap guarantees
|
||||
# cleanup even when an assertion exits the test non-zero (lint_cleanup_traps).
|
||||
TMPDIR_E2E=$(mktemp -d -t require-live-guard-XXXXXX)
|
||||
trap 'rm -rf "$TMPDIR_E2E"' EXIT INT TERM
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
# Reproduce the EXACT guard logic from test_staging_full_saas.sh. Kept in
|
||||
# lockstep with the host script: if the host logic changes, this test must
|
||||
# change with it (and a divergence is itself a signal to re-prove the gate).
|
||||
make_guard_runner() {
|
||||
cat <<'EOF'
|
||||
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
|
||||
LIVE_MILESTONES=""
|
||||
live_milestone() {
|
||||
case " $LIVE_MILESTONES " in
|
||||
*" $1 "*) ;;
|
||||
*) LIVE_MILESTONES="$LIVE_MILESTONES $1" ;;
|
||||
esac
|
||||
}
|
||||
require_live_or_die() {
|
||||
[ "$REQUIRE_LIVE" = "1" ] || return 0
|
||||
local required="provisioned tenant_online workspace_online a2a_roundtrip"
|
||||
local m missing=""
|
||||
for m in $required; do
|
||||
case " $LIVE_MILESTONES " in
|
||||
*" $m "*) ;;
|
||||
*) missing="$missing $m" ;;
|
||||
esac
|
||||
done
|
||||
if [ -n "$missing" ]; then
|
||||
echo "MISSING:${missing}" >&2
|
||||
exit 5
|
||||
fi
|
||||
}
|
||||
EOF
|
||||
}
|
||||
|
||||
# run_case <E2E_REQUIRE_LIVE value> <space-separated milestones to stamp>
|
||||
# echoes the observed exit code.
|
||||
run_case() {
|
||||
local require_live="$1"; shift
|
||||
local milestones="$1"; shift || true
|
||||
local stub observed m
|
||||
stub=$(mktemp "$TMPDIR_E2E/stub.XXXXXX")
|
||||
{
|
||||
echo "#!/usr/bin/env bash"
|
||||
echo "set -uo pipefail"
|
||||
make_guard_runner
|
||||
for m in $milestones; do
|
||||
echo "live_milestone $m"
|
||||
done
|
||||
echo "require_live_or_die"
|
||||
echo 'echo REACHED_END'
|
||||
} > "$stub"
|
||||
E2E_REQUIRE_LIVE="$require_live" bash "$stub" >/dev/null 2>&1
|
||||
observed=$?
|
||||
rm -f "$stub"
|
||||
echo "$observed"
|
||||
}
|
||||
|
||||
assert_rc() {
|
||||
local label="$1" require_live="$2" milestones="$3" expected="$4"
|
||||
local observed
|
||||
observed=$(run_case "$require_live" "$milestones")
|
||||
if [ "$observed" = "$expected" ]; then
|
||||
echo " ✓ $label: REQUIRE_LIVE=$require_live milestones='$milestones' → rc=$observed"
|
||||
PASS=$((PASS+1))
|
||||
else
|
||||
echo " ✗ $label: REQUIRE_LIVE=$require_live milestones='$milestones' expected=$expected OBSERVED=$observed" >&2
|
||||
FAIL=$((FAIL+1))
|
||||
fi
|
||||
}
|
||||
|
||||
echo "=== E2E_REQUIRE_LIVE fail-closed-on-skip guard proof ==="
|
||||
echo
|
||||
|
||||
# DECISIVE (false-green trap): REQUIRE_LIVE=1 but NO lifecycle ran → exit 5.
|
||||
assert_rc "require-live, nothing ran → exit 5 (the false-green trap)" \
|
||||
1 "" 5
|
||||
|
||||
# REQUIRE_LIVE=1 with a partial lifecycle (provisioned but no A2A) → exit 5.
|
||||
assert_rc "require-live, partial lifecycle → exit 5" \
|
||||
1 "provisioned tenant_online workspace_online" 5
|
||||
|
||||
# REQUIRE_LIVE=1 with every required milestone → pass (rc=0).
|
||||
assert_rc "require-live, full lifecycle → pass" \
|
||||
1 "provisioned tenant_online workspace_online a2a_roundtrip" 0
|
||||
|
||||
# Idempotency: duplicate stamps don't break membership; full set still passes.
|
||||
assert_rc "require-live, duplicate stamps still pass" \
|
||||
1 "provisioned provisioned tenant_online workspace_online a2a_roundtrip a2a_roundtrip" 0
|
||||
|
||||
# Guard is a no-op when CI did not demand a live run: a non-live local run
|
||||
# with nothing stamped must NOT exit 5 (we don't break local/debug runs).
|
||||
assert_rc "no require-live, nothing ran → pass (guard is opt-in)" \
|
||||
0 "" 0
|
||||
assert_rc "require-live unset-equivalent (0), partial → pass" \
|
||||
0 "provisioned" 0
|
||||
|
||||
# Extra unknown milestone is harmless as long as required set is present.
|
||||
assert_rc "require-live, extra milestone tolerated" \
|
||||
1 "provisioned tenant_online workspace_online a2a_roundtrip extra_thing" 0
|
||||
|
||||
echo
|
||||
echo "=== Results: $PASS passed, $FAIL failed ==="
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@@ -40,9 +40,25 @@
|
||||
# E2E_INTENTIONAL_FAILURE 1 → break a step on purpose to verify
|
||||
# the EXIT trap still tears down (mirrors
|
||||
# the full-saas harness's safety net).
|
||||
# E2E_REQUIRE_LIVE 1 → fail-closed if the harness exits 0
|
||||
# WITHOUT having driven all four
|
||||
# awaiting_agent transitions. CI sets this
|
||||
# so a future skip / early-return can never
|
||||
# masquerade as a green run. Mirrors CP
|
||||
# serving-e2e SERVING_E2E_REQUIRE_LIVE.
|
||||
# E2E_STALE_POLL_DEADLINE_SECS default 240. Upper bound for the
|
||||
# heartbeat-staleness READINESS poll (step
|
||||
# 6). Replaces the old fixed sleep+one-shot
|
||||
# assert that raced the sweep cadence.
|
||||
# E2E_TRANSIENT_RETRIES default 8. Bounded retries for register /
|
||||
# re-register against transient edge errors
|
||||
# (502/503/504 from Caddy during cold TLS /
|
||||
# agent boot). Mirrors the full-saas
|
||||
# cold-start retry loop — NOT a bare sleep.
|
||||
#
|
||||
# Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
|
||||
# 4 teardown leak.
|
||||
# 4 teardown leak, 5 REQUIRE_LIVE violation (exited 0 having validated
|
||||
# nothing).
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -51,6 +67,13 @@ ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway s
|
||||
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
|
||||
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
|
||||
STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
|
||||
# Readiness-poll deadline for the sweep transition (step 6). Must exceed
|
||||
# STALE_WAIT_SECS (the no-heartbeat window) by at least one sweep
|
||||
# interval so a slightly-late sweep tick is polled-for, not misread as a
|
||||
# stuck 'online'. 240 = 180s window + 60s sweep-cadence headroom.
|
||||
STALE_POLL_DEADLINE_SECS="${E2E_STALE_POLL_DEADLINE_SECS:-240}"
|
||||
TRANSIENT_RETRIES="${E2E_TRANSIENT_RETRIES:-8}"
|
||||
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
|
||||
|
||||
SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
|
||||
SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
|
||||
@@ -59,6 +82,66 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
|
||||
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
|
||||
|
||||
# REQUIRE_LIVE bookkeeping: count the four awaiting_agent transitions the
|
||||
# test is contracted to prove. The EXIT trap fails-closed (exit 5) if the
|
||||
# script reaches a clean exit without all four — so a silent skip, an
|
||||
# early `return 0`, or a refactor that drops a step can never show green.
|
||||
TRANSITIONS_VERIFIED=0
|
||||
EXPECTED_TRANSITIONS=4
|
||||
require_transition() { # $1 = human label
|
||||
TRANSITIONS_VERIFIED=$((TRANSITIONS_VERIFIED + 1))
|
||||
log " [require-live] transition ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} proven: $1"
|
||||
}
|
||||
|
||||
# Redact bearer tokens from any HTTP body before logging (mirrors the
|
||||
# full-saas sanitize_http_body so transient-error logs never leak creds).
|
||||
sanitize_http_body() {
|
||||
sed -E 's/(Bearer|token)[[:space:]]+[A-Za-z0-9._-]+/\1 REDACTED/g'
|
||||
}
|
||||
|
||||
# Bounded retry-on-transient for POST /registry/register. The tenant edge
|
||||
# (Caddy) returns 502/503/504 with an identifiable body while TLS / the
|
||||
# workspace agent finishes cold-booting — a single shot here was the
|
||||
# un-named flake (a transient edge error misread as a register failure).
|
||||
# This mirrors the full-saas cold-start loop (test_staging_full_saas.sh
|
||||
# ~L780-816): retry ONLY on a transient TRANSPORT class (5xx + body
|
||||
# match), bounded by TRANSIENT_RETRIES, and FAIL CLOSED (non-zero) once
|
||||
# the budget is spent. It deliberately does NOT retry on a 4xx — that's a
|
||||
# real contract bug (e.g. wrong payload field) and must stay red.
|
||||
# Sets REGISTER_RESP (body + trailing "HTTP_CODE=NNN" line) on success;
|
||||
# returns non-zero (caller `fail`s) when the bounded budget is exhausted.
|
||||
register_with_retry() { # $1 = step label, $2 = request body
|
||||
local label="$1" body="$2"
|
||||
local attempt code resp safe
|
||||
for attempt in $(seq 1 "$TRANSIENT_RETRIES"); do
|
||||
set +e
|
||||
resp=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST \
|
||||
"$TENANT_URL/registry/register" \
|
||||
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$body")
|
||||
set -e
|
||||
code=$(printf '%s' "$resp" | sed -n 's/^HTTP_CODE=//p' | tail -n1)
|
||||
code=${code:-000}
|
||||
if [ "$code" = "200" ]; then
|
||||
REGISTER_RESP="$resp"
|
||||
return 0
|
||||
fi
|
||||
safe=$(printf '%s' "$resp" | sanitize_http_body | head -c 300)
|
||||
# Retry ONLY on a transient transport class; a 4xx is a real bug.
|
||||
if echo "$code" | grep -Eq '^(502|503|504)$' \
|
||||
&& echo "$safe" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream'; then
|
||||
log " ${label} transient $code attempt ${attempt}/${TRANSIENT_RETRIES}: $safe"
|
||||
[ "$attempt" -lt "$TRANSIENT_RETRIES" ] && { sleep 10; continue; }
|
||||
fi
|
||||
# Non-transient (4xx, or unrecognized 5xx body): stop and fail closed.
|
||||
REGISTER_RESP="$resp"
|
||||
return 1
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
|
||||
# ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
|
||||
@@ -98,8 +181,19 @@ cleanup_org() {
|
||||
fi
|
||||
ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
|
||||
|
||||
# REQUIRE_LIVE fail-closed gate. Only meaningful on an OTHERWISE-CLEAN
|
||||
# exit (entry_rc==0): a script that completed all steps but somehow did
|
||||
# not register all four transitions (a skip, an early return, a dropped
|
||||
# assertion in a refactor) must NOT report success. A non-zero entry_rc
|
||||
# already carries its own failure semantics — don't mask it with 5.
|
||||
if [ "$entry_rc" = "0" ] && [ "${REQUIRE_LIVE}" = "1" ] \
|
||||
&& [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
|
||||
echo "❌ REQUIRE_LIVE: exited 0 but only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} awaiting_agent transitions were proven — refusing to report green." >&2
|
||||
exit 5
|
||||
fi
|
||||
|
||||
case "$entry_rc" in
|
||||
0|1|2|3|4) ;;
|
||||
0|1|2|3|4|5) ;;
|
||||
*) exit 1 ;;
|
||||
esac
|
||||
}
|
||||
@@ -248,6 +342,7 @@ GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
|
||||
ok "DB row stored as awaiting_agent (proof migration 046 applied)"
|
||||
require_transition "create: provisioning → awaiting_agent (DB-verified)"
|
||||
|
||||
# ─── 5. Register the workspace (transitions to online) ──────────────────
|
||||
# Pre-fix this path was actually fine because it writes 'online', a value
|
||||
@@ -277,20 +372,20 @@ log "5/8 Registering workspace via /registry/register..."
|
||||
# url — accepted but not dispatched-to in poll mode, so
|
||||
# example.invalid is a valid sentinel.
|
||||
REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
|
||||
# Disable --fail-with-body for this one call so a 4xx surfaces the response
|
||||
# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
|
||||
REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
|
||||
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$REGISTER_BODY") || true
|
||||
log " register response: $(echo "$REGISTER_RESP" | head -c 300)"
|
||||
echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
|
||||
# Bounded retry-on-transient (see register_with_retry). The previous
|
||||
# single-shot here would `fail` on a cold-boot 502 from the tenant edge —
|
||||
# an un-named transient misread as a register break. The helper retries
|
||||
# ONLY that class and fails closed on a real 4xx or an exhausted budget.
|
||||
REGISTER_RESP=""
|
||||
register_with_retry "register" "$REGISTER_BODY" \
|
||||
|| fail "register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
log " register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
|
||||
ok "Workspace transitioned to online"
|
||||
require_transition "register: awaiting_agent → online"
|
||||
|
||||
# Confirm the register handler echoed back delivery_mode=poll. We read
|
||||
# this from the register RESPONSE, not the workspace GET response, because
|
||||
@@ -310,38 +405,63 @@ fi
|
||||
# This is the SECOND silent-failure path (registry/healthsweep.go's
|
||||
# sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
|
||||
# UPDATE silently failed and the workspace stuck on 'online' forever
|
||||
# even though no agent was alive. We wait the full window + a sweep
|
||||
# interval and assert the row transitions back to 'awaiting_agent'.
|
||||
log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
|
||||
# even though no agent was alive.
|
||||
#
|
||||
# FLAKE FIX (named: sweep-cadence race). The old code did a FIXED
|
||||
# `sleep $STALE_WAIT_SECS` then a SINGLE assert. The staleness sweep is a
|
||||
# periodic tick (REMOTE_LIVENESS_STALE_AFTER + a sweep interval); if the
|
||||
# tick that flips the row lands even one second after the fixed sleep, the
|
||||
# one-shot GET reads 'online' and the test fails — a real transition,
|
||||
# misread as a flake because the assert was racing the sweep cadence.
|
||||
# Replace with: sleep through the mandatory no-heartbeat window ONCE (the
|
||||
# sweep cannot fire before the window elapses, so polling earlier is
|
||||
# pointless), then READINESS-POLL for the awaiting_agent transition up to
|
||||
# STALE_POLL_DEADLINE_SECS, hard-failing with a clear message at the
|
||||
# deadline. Deterministic: a slow-but-working sweep passes; a genuinely
|
||||
# stuck 'online' still fails (now with how long we actually waited).
|
||||
log "6/8 Waiting ${STALE_WAIT_SECS}s no-heartbeat window, then polling for sweep (up to ${STALE_POLL_DEADLINE_SECS}s total)..."
|
||||
[ "$STALE_POLL_DEADLINE_SECS" -le "$STALE_WAIT_SECS" ] && \
|
||||
fail "Misconfigured: STALE_POLL_DEADLINE_SECS ($STALE_POLL_DEADLINE_SECS) must exceed STALE_WAIT_SECS ($STALE_WAIT_SECS) by at least one sweep interval"
|
||||
sleep "$STALE_WAIT_SECS"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$STALE_STATUS" != "awaiting_agent" ] && \
|
||||
fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
|
||||
STALE_DEADLINE=$(( $(date +%s) + (STALE_POLL_DEADLINE_SECS - STALE_WAIT_SECS) ))
|
||||
STALE_STATUS=""
|
||||
while true; do
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$STALE_STATUS" = "awaiting_agent" ] && break
|
||||
if [ "$(date +%s)" -gt "$STALE_DEADLINE" ]; then
|
||||
fail "After ${STALE_POLL_DEADLINE_SECS}s with no heartbeat, status still '$STALE_STATUS' (expected awaiting_agent sweep transition) — migration 046 likely not applied OR sweep not running"
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
|
||||
require_transition "sweep: online → awaiting_agent (no heartbeat)"
|
||||
|
||||
# ─── 7. Re-register and confirm we can come back online ─────────────────
|
||||
# This proves the awaiting_agent state is recoverable (re-registrable),
|
||||
# which is the whole point of using it instead of 'offline'.
|
||||
log "7/8 Re-registering after stale → confirming recovery to online..."
|
||||
# Same payload contract as step 5 (id + agent_card both required). See note
|
||||
# there for why workspace_id would 400.
|
||||
REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
|
||||
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$REGISTER_BODY") || true
|
||||
log " re-register response: $(echo "$REREG_RESP" | head -c 300)"
|
||||
echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
|
||||
# there for why workspace_id would 400. Same bounded retry-on-transient.
|
||||
REGISTER_RESP=""
|
||||
register_with_retry "re-register" "$REGISTER_BODY" \
|
||||
|| fail "re-register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
log " re-register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$RECOVERED_STATUS" != "online" ] && \
|
||||
fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
|
||||
ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
|
||||
require_transition "re-register: awaiting_agent → online (recovery)"
|
||||
|
||||
# ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
|
||||
# REQUIRE_LIVE belt-and-braces: assert here too (in addition to the EXIT
|
||||
# trap) so the failure surfaces in step order, not only post-teardown.
|
||||
if [ "${REQUIRE_LIVE}" = "1" ] && [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
|
||||
fail "REQUIRE_LIVE: only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} transitions proven at end of run"
|
||||
fi
|
||||
log "8/8 All four awaiting_agent transitions verified."
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
ok "External-runtime E2E PASSED on $SLUG"
|
||||
|
||||
@@ -47,6 +47,15 @@
|
||||
# tear down cleanly (and exit 4 on leak).
|
||||
# Used by a dedicated sanity workflow
|
||||
# that verifies the safety net.
|
||||
# E2E_REQUIRE_LIVE 1 → fail-closed-on-skip guard (CI sets this).
|
||||
# When set, the run MUST actually complete
|
||||
# ≥1 full provision→online→A2A cycle. A run
|
||||
# that reaches the end without having proven
|
||||
# a real round-trip (e.g. a future refactor
|
||||
# short-circuits a stage, or a skip path
|
||||
# swallows the lifecycle) exits 5 rather than
|
||||
# reporting a false green. Mirrors CP
|
||||
# serving-e2e's SERVING_E2E_REQUIRE_LIVE.
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 happy path
|
||||
@@ -54,6 +63,37 @@
|
||||
# 2 missing required env
|
||||
# 3 provisioning timed out
|
||||
# 4 teardown left orphan resources
|
||||
# 5 E2E_REQUIRE_LIVE set but the run validated no real lifecycle (no
|
||||
# false-green-on-skip)
|
||||
#
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# PROMOTION-READINESS (harden/e2e-staging-saas-failclosed):
|
||||
# This harness is being hardened so `E2E Staging SaaS` + `E2E Staging
|
||||
# Platform Boot` can become HARD merge-gates. continue-on-error is NOT
|
||||
# flipped here — that promotion is the CTO's irreversible branch-protection
|
||||
# call. What this branch makes fail-closed (was false-green / un-named
|
||||
# flake before):
|
||||
# • Provision/online waits are bounded readiness-POLLS, not fixed sleeps;
|
||||
# each hard-fails with a named mechanism + last-seen signal on deadline,
|
||||
# never a silent timeout (cp#245 boot-timeout class).
|
||||
# • Peer-discovery (9b) asserts a real 2xx, not just "not 404" — a 5xx /
|
||||
# 000 / empty no longer reads as "reachable".
|
||||
# • Activity-log (9b) is ASSERTED reachable (2xx + parseable), not
|
||||
# logged-and-ignored behind `|| echo '[]'`.
|
||||
# • Child activity provenance (10) is asserted (was soft-logged).
|
||||
# • E2E_REQUIRE_LIVE=1 (CI) makes the run exit 5 if it reached the end
|
||||
# without proving a real provision→online→A2A round-trip — no
|
||||
# false-green-on-skip.
|
||||
# STILL BLOCKS making it REQUIRED (must clear before the CTO flips
|
||||
# continue-on-error→false in .gitea/workflows/e2e-staging-saas.yml):
|
||||
# • De-flake window: N consecutive green runs on main for BOTH jobs
|
||||
# (platform-boot shares the cp#245 boot surface — #2187 tracks its
|
||||
# flip). This harness removes the harness-side flake mechanisms; the
|
||||
# remaining surface is real-infra (EC2 cold boot, CF DNS) latency,
|
||||
# already bounded by the readiness polls above.
|
||||
# • Branch-protection required-context wiring is a repo-settings change,
|
||||
# not a code change in this PR.
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -90,6 +130,41 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
|
||||
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
|
||||
|
||||
# ─── fail-closed-on-skip live-lifecycle guard ───────────────────────────
|
||||
# E2E_REQUIRE_LIVE=1 (set by CI) asserts this run ACTUALLY exercised a full
|
||||
# provision→online→A2A cycle. Each load-bearing lifecycle stage stamps a
|
||||
# milestone via live_milestone(); at the very end, require_live_or_die()
|
||||
# checks every required milestone fired. Mechanism: without this, a future
|
||||
# refactor that short-circuits a stage — or a skip/early-return path that
|
||||
# swallows the lifecycle — would let the script reach its final `ok` and
|
||||
# report GREEN having validated nothing. Mirrors CP serving-e2e's
|
||||
# SERVING_E2E_REQUIRE_LIVE (skip-if-absent must be LOUD, never silent green).
|
||||
REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
|
||||
LIVE_MILESTONES=""
|
||||
live_milestone() {
|
||||
# Idempotent set-membership append. Space-delimited; names are tokens.
|
||||
case " $LIVE_MILESTONES " in
|
||||
*" $1 "*) ;;
|
||||
*) LIVE_MILESTONES="$LIVE_MILESTONES $1" ;;
|
||||
esac
|
||||
}
|
||||
require_live_or_die() {
|
||||
# No-op unless CI demanded a live run.
|
||||
[ "$REQUIRE_LIVE" = "1" ] || return 0
|
||||
local required="provisioned tenant_online workspace_online a2a_roundtrip"
|
||||
local m missing=""
|
||||
for m in $required; do
|
||||
case " $LIVE_MILESTONES " in
|
||||
*" $m "*) ;;
|
||||
*) missing="$missing $m" ;;
|
||||
esac
|
||||
done
|
||||
if [ -n "$missing" ]; then
|
||||
echo "[$(date +%H:%M:%S)] ❌ E2E_REQUIRE_LIVE=1 but the run did NOT prove a full live lifecycle — missing milestone(s):${missing}. Reached:${LIVE_MILESTONES:-<none>}. This is a false-green-on-skip guard: a run that validates no real provision→online→A2A cycle MUST NOT report green." >&2
|
||||
exit 5
|
||||
fi
|
||||
}
|
||||
|
||||
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
|
||||
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
|
||||
# without booting the full 11-step lifecycle.
|
||||
@@ -197,7 +272,7 @@ cleanup_org() {
|
||||
# case statement, and opens a false-positive priority-high
|
||||
# "safety net broken" issue (#2159, 2026-04-27).
|
||||
case "$entry_rc" in
|
||||
0|1|2|3|4) ;; # contracted codes — let bash use entry_rc
|
||||
0|1|2|3|4|5) ;; # contracted codes — let bash use entry_rc
|
||||
*) exit 1 ;; # anything else is a generic failure
|
||||
esac
|
||||
}
|
||||
@@ -295,6 +370,7 @@ print('(no org row found for slug=$SLUG — DB drift?)')
|
||||
esac
|
||||
done
|
||||
ok "Tenant provisioning complete"
|
||||
live_milestone provisioned
|
||||
|
||||
# Derive tenant domain from CP hostname so the same harness works in
|
||||
# both prod (api.moleculesai.app → moleculesai.app) and staging
|
||||
@@ -351,6 +427,7 @@ while true; do
|
||||
sleep 5
|
||||
done
|
||||
ok "Tenant reachable at $TENANT_URL"
|
||||
live_milestone tenant_online
|
||||
|
||||
# Sanity-test path: once the tenant is provisioned, poisoning the
|
||||
# tenant token proves the EXIT trap + leak assertion still fire.
|
||||
@@ -570,6 +647,7 @@ fi
|
||||
WS_TO_CHECK=("$PARENT_ID")
|
||||
[ -n "$CHILD_ID" ] && WS_TO_CHECK+=("$CHILD_ID")
|
||||
wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min — hermes cold boot)..." "${WS_TO_CHECK[@]}"
|
||||
live_milestone workspace_online
|
||||
|
||||
# ─── 7a. Real chat image upload/download round-trip ───────────────────
|
||||
# This deliberately uses the production workflow: tenant admin/session auth
|
||||
@@ -886,7 +964,7 @@ fi
|
||||
# identical on main's scheduled synthetic E2E and on PRs (so it is an
|
||||
# environmental backend regression, never PR-introduced).
|
||||
if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
|
||||
fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (MiniMax-M2 since #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
|
||||
fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code default is minimax:MiniMax-M2.7 since #2263; was bare MiniMax-M2 #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
|
||||
fi
|
||||
# Generic catch-all — falls through if none of the known regressions hit.
|
||||
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
|
||||
@@ -952,7 +1030,14 @@ for KA_ATTEMPT in $(seq 1 6); do
|
||||
KA_SAFE_BODY=$(printf '%s' "$KA_RESP" | sanitize_http_body)
|
||||
# Retry ONLY on transient transport errors — never on an agent-level
|
||||
# error (those must surface and fail the gate).
|
||||
if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
|
||||
# #2263: include the Cloudflare-shaped literal `error code: 502/504` token so a
|
||||
# bare edge/gateway 502 (no "Bad Gateway" body) is retried here the same way the
|
||||
# cold-start PONG probe (line ~800) and the delegation loop (line ~1234) already
|
||||
# do. Without it, a single un-retried edge 502 right after a healthy round-trip
|
||||
# fell through to break and failed the gate on the first attempt (Platform Boot
|
||||
# job, task 268859). Bounded by the existing 6-attempt / sleep-10 loop — no new
|
||||
# sleep-as-fix; this only widens the transient-match to the sibling pattern.
|
||||
if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
|
||||
log " known-answer A2A transient $KA_CODE attempt $KA_ATTEMPT/6: $KA_SAFE_BODY"
|
||||
if [ "$KA_ATTEMPT" -lt 6 ]; then sleep 10; continue; fi
|
||||
fi
|
||||
@@ -974,6 +1059,11 @@ except Exception:
|
||||
" 2>/dev/null || echo "")
|
||||
# CORE GATE: contains PINEAPPLE (real round-trip) AND no error-as-text.
|
||||
a2a_assert_real_completion "$KA_TEXT" "PINEAPPLE" "A2A known-answer (parent, $RUNTIME/$MODEL_SLUG)"
|
||||
# Real, deterministic LLM round-trip proven — the load-bearing milestone for
|
||||
# the fail-closed-on-skip guard. Stamped AFTER a2a_assert_real_completion (not
|
||||
# after the looser PONG check) so the milestone means a verified completion,
|
||||
# not just a 2xx-with-text.
|
||||
live_milestone a2a_roundtrip
|
||||
|
||||
# ─── 8c. byok-routing regression guard (#1994) ─────────────────────────
|
||||
# The parent was provisioned with the customer's OWN vendor key
|
||||
@@ -1099,18 +1189,50 @@ print(json.dumps({
|
||||
ok "HMA memory write+read roundtripped"
|
||||
|
||||
log "9b. Peer discovery + activity log smoke..."
|
||||
# FAIL-CLOSED: assert a real 2xx, not merely "not 404". The previous
|
||||
# `[ "$PEERS_CODE" = "404" ] && fail` only caught the route-missing case —
|
||||
# a 5xx, 000 (connection failure), or empty capture ALL fell through to
|
||||
# "reachable" (false-green: a broken-but-present route read as healthy).
|
||||
# Mechanism: route the http_code into its own tempfile (no stderr capture,
|
||||
# which the old `2>&1 | head -1` could pollute with a curl error line) and
|
||||
# require 2xx explicitly.
|
||||
PEERS_TMP=$(e2e_tmp /tmp/e2e_peers.XXXXXX)
|
||||
set +e
|
||||
tenant_call GET "/registry/$PARENT_ID/peers" -o /dev/null -w "%{http_code}\n" 2>&1 | head -1 > /tmp/peers_code.txt
|
||||
PEERS_CODE=$(tenant_call GET "/registry/$PARENT_ID/peers" \
|
||||
-o "$PEERS_TMP" -w "%{http_code}" 2>/dev/null)
|
||||
PEERS_RC=$?
|
||||
set -e
|
||||
PEERS_CODE=$(cat /tmp/peers_code.txt)
|
||||
[ "$PEERS_CODE" = "404" ] && fail "Peers endpoint missing (404) — route regression"
|
||||
PEERS_CODE=${PEERS_CODE:-000}
|
||||
if [ "$PEERS_CODE" = "404" ]; then
|
||||
fail "Peers endpoint missing (404) — route regression. /registry/$PARENT_ID/peers"
|
||||
fi
|
||||
if [ "$PEERS_RC" != "0" ] || [ "$PEERS_CODE" -lt 200 ] || [ "$PEERS_CODE" -ge 300 ]; then
|
||||
fail "Peers endpoint unhealthy (curl_rc=$PEERS_RC, http=$PEERS_CODE) — not a clean 2xx, so 'reachable' would be a false-green. Body: $(head -c 200 "$PEERS_TMP" 2>/dev/null | sanitize_http_body)"
|
||||
fi
|
||||
ok "Peers endpoint reachable (HTTP $PEERS_CODE)"
|
||||
|
||||
ACTIVITY=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" 2>/dev/null || echo '[]')
|
||||
ACTIVITY_COUNT=$(echo "$ACTIVITY" | python3 -c "import json,sys
|
||||
d=json.load(sys.stdin)
|
||||
print(len(d if isinstance(d, list) else d.get('events', [])))" 2>/dev/null || echo 0)
|
||||
log " Activity events observed: $ACTIVITY_COUNT"
|
||||
# FAIL-CLOSED: the activity-log read was `|| echo '[]'` then the count was
|
||||
# only LOGGED, never asserted — a 5xx / network failure silently became an
|
||||
# empty list and the step exited 0 having validated nothing (false-green:
|
||||
# "validated nothing" class). Assert the endpoint returns a 2xx and a
|
||||
# parseable activity shape. We do NOT assert count>0 (the parent may
|
||||
# legitimately have 0 events this early — that's a real, valid state), but
|
||||
# we DO require the call to have actually succeeded and returned valid JSON.
|
||||
ACTIVITY_TMP=$(e2e_tmp /tmp/e2e_activity.XXXXXX)
|
||||
set +e
|
||||
ACTIVITY_CODE=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" \
|
||||
-o "$ACTIVITY_TMP" -w "%{http_code}" 2>/dev/null)
|
||||
ACTIVITY_RC=$?
|
||||
set -e
|
||||
ACTIVITY_CODE=${ACTIVITY_CODE:-000}
|
||||
if [ "$ACTIVITY_RC" != "0" ] || [ "$ACTIVITY_CODE" -lt 200 ] || [ "$ACTIVITY_CODE" -ge 300 ]; then
|
||||
fail "Activity-log endpoint unhealthy (curl_rc=$ACTIVITY_RC, http=$ACTIVITY_CODE) — was previously swallowed by '|| echo []' and reported as 0 events (false-green). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
|
||||
fi
|
||||
ACTIVITY_COUNT=$(python3 -c "import json,sys
|
||||
d=json.load(open(sys.argv[1]))
|
||||
print(len(d if isinstance(d, list) else d.get('events', [])))" "$ACTIVITY_TMP" 2>/dev/null) \
|
||||
|| fail "Activity-log returned HTTP $ACTIVITY_CODE but body was not parseable JSON (events array / {events:[...]}). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
|
||||
log " Activity events observed: $ACTIVITY_COUNT (endpoint 2xx + parseable ✓)"
|
||||
|
||||
# ─── 9c. Workspace KV memory Edit round-trip ─────────────────────────
|
||||
# Pins the Edit affordance added to the canvas Memory tab. The UI calls
|
||||
@@ -1261,14 +1383,44 @@ except Exception:
|
||||
[ -z "$DELEG_TEXT" ] && fail "Delegation returned no text. Raw: ${DELEG_RESP:0:200}"
|
||||
ok "Delegation proxy works (child responded: \"${DELEG_TEXT:0:60}\")"
|
||||
|
||||
CHILD_ACT=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" 2>/dev/null || echo '[]')
|
||||
if echo "$CHILD_ACT" | grep -q "$PARENT_ID"; then
|
||||
# FAIL-CLOSED via bounded readiness-POLL (was soft-logged false-green).
|
||||
# The activity pipeline is async, so an immediate single read can miss the
|
||||
# parent reference — but "did not reference parent" was previously just
|
||||
# LOGGED and the step passed regardless, so a genuinely broken provenance
|
||||
# pipeline (parent never recorded as source) read as success. Mechanism:
|
||||
# poll the child activity log for the parent id for a bounded window
|
||||
# (E2E_CHILD_ACTIVITY_TIMEOUT_SECS, default 60s) — this is the real
|
||||
# readiness signal (provenance row materialised), not a fixed sleep — and
|
||||
# hard-fail with a named mechanism if it never appears.
|
||||
CHILD_ACT_DEADLINE=$(( $(date +%s) + ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60} ))
|
||||
CHILD_ACT_SEEN=0
|
||||
CHILD_ACT_LASTCODE="000"
|
||||
while true; do
|
||||
CHILD_ACT_TMP=$(e2e_tmp /tmp/e2e_child_act.XXXXXX)
|
||||
set +e
|
||||
CHILD_ACT_CODE=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" \
|
||||
-o "$CHILD_ACT_TMP" -w "%{http_code}" 2>/dev/null)
|
||||
set -e
|
||||
CHILD_ACT_LASTCODE=${CHILD_ACT_CODE:-000}
|
||||
if grep -q "$PARENT_ID" "$CHILD_ACT_TMP" 2>/dev/null; then
|
||||
CHILD_ACT_SEEN=1
|
||||
break
|
||||
fi
|
||||
[ "$(date +%s)" -ge "$CHILD_ACT_DEADLINE" ] && break
|
||||
sleep 5
|
||||
done
|
||||
if [ "$CHILD_ACT_SEEN" = "1" ]; then
|
||||
ok "Child activity log records parent as source"
|
||||
else
|
||||
log "Child activity log did not reference parent (pipeline may be async)"
|
||||
fail "Child activity log never referenced parent $PARENT_ID within ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60}s (last http=$CHILD_ACT_LASTCODE) — delegation-provenance pipeline regression (parent not recorded as source). Previously soft-logged → false-green."
|
||||
fi
|
||||
fi
|
||||
|
||||
# ─── 11. Teardown runs via trap ────────────────────────────────────────
|
||||
# Fail-closed-on-skip: before declaring PASS, assert (when CI demanded a live
|
||||
# run) that every load-bearing lifecycle milestone actually fired. A run that
|
||||
# reaches here without provision→online→A2A having truly happened exits 5
|
||||
# instead of reporting green. Teardown still runs (EXIT trap) on that exit.
|
||||
require_live_or_die
|
||||
log "11/11 All checks passed. Teardown runs via EXIT trap."
|
||||
ok "═══ STAGING $MODE-SAAS E2E PASSED ═══"
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
package handlers
|
||||
|
||||
// a2a_outbound_envelope_test.go — outbound A2A `message/send` envelope
|
||||
// CONTRACT gate (issue #2251).
|
||||
//
|
||||
// #2251: an outbound A2A envelope shipped without `role` and with text
|
||||
// parts keyed `type` instead of the v0.3-canonical `kind`. The receiver's
|
||||
// a-2-a-sdk v0.3 Pydantic validator silently rejected the message
|
||||
// post-dispatch — the sender saw a happy 200/202 while the brief was
|
||||
// dropped (the same invisible-rejection failure class as the v0.2→v0.3
|
||||
// content bug pinned by a2a_corpus_test.go, but on the SEND side).
|
||||
//
|
||||
// The inbound corpus replay (a2a_corpus_test.go) proves normalizeA2APayload
|
||||
// produces `parts[].kind` + a non-empty messageId, but it does NOT assert
|
||||
// `role`, and it only covers what we RECEIVE. Nothing pins what core
|
||||
// EMITS. This file pins the emit contract at the helper that builds the
|
||||
// parts (buildA2AMessageParts, used by both delegate_task and
|
||||
// delegate_task_async) and asserts the canonical Part key is `kind`.
|
||||
//
|
||||
// Part-object schema (A2A v0.3): every Part MUST carry a `kind`
|
||||
// discriminator ("text" | "file" | "data"); there is NO `type` key. A
|
||||
// text Part is {"kind":"text","text":"..."}. Emitting `type` makes the
|
||||
// v0.3 validator drop the Part.
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestBuildA2AMessageParts_TextPartUsesKindNotType pins the v0.3 Part
|
||||
// discriminator for the text part emitted on every outbound A2A
|
||||
// delegation. RED before #2251's fix (the helper emitted
|
||||
// {"type":"text",...}); the receiver's v0.3 Pydantic validator drops a
|
||||
// Part keyed `type`, silently losing the task text.
|
||||
func TestBuildA2AMessageParts_TextPartUsesKindNotType(t *testing.T) {
|
||||
parts := buildA2AMessageParts("do the work", nil)
|
||||
if len(parts) == 0 {
|
||||
t.Fatal("buildA2AMessageParts returned no parts for a non-empty task")
|
||||
}
|
||||
text := parts[0]
|
||||
|
||||
if _, hasType := text["type"]; hasType {
|
||||
t.Errorf("text part uses forbidden v0.2 key `type` %v — A2A v0.3 Parts discriminate on `kind`; `type` is dropped by the receiver's validator (#2251)", text)
|
||||
}
|
||||
kind, ok := text["kind"].(string)
|
||||
if !ok {
|
||||
t.Fatalf("text part missing string `kind` discriminator; got %v", text)
|
||||
}
|
||||
if kind != "text" {
|
||||
t.Errorf("text part kind = %q, want \"text\"", kind)
|
||||
}
|
||||
if text["text"] != "do the work" {
|
||||
t.Errorf("text part text = %v, want \"do the work\"", text["text"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildA2AMessageParts_FilePartUsesKind guards the file-attachment
|
||||
// Part the same way. The file path was already correct (it used `kind`),
|
||||
// so this is a non-regression pin — it must STAY `kind` when the text
|
||||
// path is fixed (a careless "make them consistent" edit could flip both
|
||||
// to the wrong key).
|
||||
func TestBuildA2AMessageParts_FilePartUsesKind(t *testing.T) {
|
||||
atts := []AgentMessageAttachment{
|
||||
{URI: "https://example.com/a.png", MimeType: "image/png", Name: "a.png"},
|
||||
}
|
||||
parts := buildA2AMessageParts("caption", atts)
|
||||
if len(parts) < 2 {
|
||||
t.Fatalf("expected text + file parts, got %d", len(parts))
|
||||
}
|
||||
file := parts[1]
|
||||
if _, hasType := file["type"]; hasType {
|
||||
t.Errorf("file part uses forbidden `type` key: %v", file)
|
||||
}
|
||||
if _, hasKind := file["kind"]; !hasKind {
|
||||
t.Errorf("file part missing `kind` discriminator: %v", file)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDelegationOutboundEnvelope_RoleAndKind pins the FULL outbound
|
||||
// envelope contract — role + parts[].kind — on the canonical helper.
|
||||
// A v0.3 `message` MUST carry `role` ("user" for a delegation request)
|
||||
// and `parts` whose every entry discriminates on `kind`. This is the
|
||||
// shape the receiver's MessageSendParams validator accepts; an envelope
|
||||
// missing `role` or keyed `type` is silently rejected (#2251).
|
||||
//
|
||||
// Built from the same primitives delegation.go / mcp_tools.go assemble
|
||||
// (role:"user" + buildA2AMessageParts) so the round-trip through
|
||||
// json.Marshal proves the wire bytes are v0.3-valid.
|
||||
func TestDelegationOutboundEnvelope_RoleAndKind(t *testing.T) {
|
||||
envelope := map[string]interface{}{
|
||||
"method": "message/send",
|
||||
"params": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"role": "user",
|
||||
"messageId": "deleg-1",
|
||||
"parts": buildA2AMessageParts("do the work", nil),
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
var parsed map[string]interface{}
|
||||
if err := json.Unmarshal(raw, &parsed); err != nil {
|
||||
t.Fatalf("unmarshal envelope: %v", err)
|
||||
}
|
||||
|
||||
params, _ := parsed["params"].(map[string]interface{})
|
||||
if params == nil {
|
||||
t.Fatal("envelope missing params")
|
||||
}
|
||||
msg, _ := params["message"].(map[string]interface{})
|
||||
if msg == nil {
|
||||
t.Fatal("envelope missing params.message")
|
||||
}
|
||||
|
||||
// role is mandatory on a v0.3 message — the receiver rejects without it.
|
||||
role, hasRole := msg["role"].(string)
|
||||
if !hasRole || role == "" {
|
||||
t.Errorf("params.message missing non-empty `role` — v0.3 requires it; omitting it is the other half of #2251")
|
||||
}
|
||||
|
||||
parts, _ := msg["parts"].([]interface{})
|
||||
if len(parts) == 0 {
|
||||
t.Fatal("params.message.parts is empty")
|
||||
}
|
||||
for i, p := range parts {
|
||||
pm, _ := p.(map[string]interface{})
|
||||
if pm == nil {
|
||||
t.Errorf("part %d is not an object: %v", i, p)
|
||||
continue
|
||||
}
|
||||
if _, hasType := pm["type"]; hasType {
|
||||
t.Errorf("part %d uses forbidden `type` key (must be `kind`): %v", i, pm)
|
||||
}
|
||||
if _, hasKind := pm["kind"]; !hasKind {
|
||||
t.Errorf("part %d missing `kind` discriminator: %v", i, pm)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -801,6 +801,18 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
|
||||
if _, hasID := msg["messageId"]; !hasID {
|
||||
msg["messageId"] = uuid.New().String()
|
||||
}
|
||||
// #2251: default params.message.role to "user" when absent.
|
||||
// The downstream a2a-sdk v0.3 Pydantic validator marks role a
|
||||
// REQUIRED field; a role-less envelope fails parse with
|
||||
// "params.message.role Field required". The Go builders
|
||||
// (mcp_tools/delegation/scheduler/channels) already set it, but
|
||||
// raw external/canvas POSTs to ProxyA2A may omit it — making this
|
||||
// the single canonical choke that guarantees a schema-valid role.
|
||||
// Mirror the messageId default exactly: inject only when missing,
|
||||
// never overwrite a caller-supplied role (e.g. "agent").
|
||||
if _, hasRole := msg["role"]; !hasRole {
|
||||
msg["role"] = "user"
|
||||
}
|
||||
_, hasParts := msg["parts"]
|
||||
rawContent, hasContent := msg["content"]
|
||||
if !hasParts {
|
||||
@@ -832,6 +844,27 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// #2251: wire hygiene — the A2A v0.3 Part discriminator is
|
||||
// "kind", but some builders/clients emit the legacy "type" key
|
||||
// (e.g. delegation.go). The v0.3 Pydantic validator keys on
|
||||
// "kind"; a stray "type" leaves the Part untagged. Rename
|
||||
// "type" → "kind" on every Part that lacks an explicit "kind"
|
||||
// so the discriminator is always present on the wire.
|
||||
if parts, ok := msg["parts"].([]interface{}); ok {
|
||||
for _, p := range parts {
|
||||
part, ok := p.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if _, hasKind := part["kind"]; hasKind {
|
||||
continue
|
||||
}
|
||||
if t, hasType := part["type"]; hasType {
|
||||
part["kind"] = t
|
||||
delete(part, "type")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1514,6 +1514,142 @@ func TestNormalizeA2APayload_NoMessageNoCheck(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// --- #2251: role default + part-kind hygiene contract tests ---
|
||||
//
|
||||
// These assert normalizeA2APayload is the single canonical Go choke that
|
||||
// guarantees a schema-valid outbound message/send envelope: it injects a
|
||||
// default params.message.role="user" when the sender omitted role (the bug
|
||||
// that made delegate_task fail the peer's a2a Pydantic validator with
|
||||
// "params.message.role Field required" while reply_to_workspace worked), and
|
||||
// it renames the legacy Part discriminator "type"→"kind" for wire hygiene.
|
||||
|
||||
// normMsg is a small helper that runs normalizeA2APayload and returns the
|
||||
// resolved params.message map, failing the test on any normalization error.
|
||||
func normMsg(t *testing.T, raw string) map[string]interface{} {
|
||||
t.Helper()
|
||||
out, _, perr := normalizeA2APayload([]byte(raw))
|
||||
if perr != nil {
|
||||
t.Fatalf("normalizeA2APayload returned error: %+v", perr)
|
||||
}
|
||||
var parsed map[string]interface{}
|
||||
if err := json.Unmarshal(out, &parsed); err != nil {
|
||||
t.Fatalf("output not valid JSON: %v", err)
|
||||
}
|
||||
params, ok := parsed["params"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("output missing params object: %s", string(out))
|
||||
}
|
||||
msg, ok := params["message"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("output missing params.message object: %s", string(out))
|
||||
}
|
||||
return msg
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_DefaultsRoleWhenMissing(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
}{
|
||||
{
|
||||
name: "v0.3 parts, no role",
|
||||
raw: `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
|
||||
},
|
||||
{
|
||||
name: "v0.2 string content, no role",
|
||||
raw: `{"method":"message/send","params":{"message":{"content":"hi"}}}`,
|
||||
},
|
||||
{
|
||||
name: "legacy type part, no role",
|
||||
raw: `{"method":"message/send","params":{"message":{"parts":[{"type":"text","text":"hi"}]}}}`,
|
||||
},
|
||||
{
|
||||
name: "already wrapped jsonrpc, no role",
|
||||
raw: `{"jsonrpc":"2.0","id":"x","method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
msg := normMsg(t, tc.raw)
|
||||
if msg["role"] != "user" {
|
||||
t.Errorf("expected role defaulted to \"user\", got %v", msg["role"])
|
||||
}
|
||||
// Parts must remain valid (non-empty) after normalization.
|
||||
parts, ok := msg["parts"].([]interface{})
|
||||
if !ok || len(parts) == 0 {
|
||||
t.Fatalf("expected non-empty parts after normalization, got %v", msg["parts"])
|
||||
}
|
||||
// Every part must carry the v0.3 "kind" discriminator.
|
||||
for i, p := range parts {
|
||||
part, ok := p.(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("part %d is not an object: %v", i, p)
|
||||
}
|
||||
if _, hasKind := part["kind"]; !hasKind {
|
||||
t.Errorf("part %d missing \"kind\" discriminator: %v", i, part)
|
||||
}
|
||||
if _, hasType := part["type"]; hasType {
|
||||
t.Errorf("part %d still has legacy \"type\" key: %v", i, part)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_PreservesExplicitRole(t *testing.T) {
|
||||
// A caller-supplied role (e.g. "agent") must NOT be overwritten with "user".
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"agent","parts":[{"kind":"text","text":"hi"}]}}}`)
|
||||
if msg["role"] != "agent" {
|
||||
t.Errorf("explicit role overwritten: expected \"agent\", got %v", msg["role"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_RenamesPartTypeToKind(t *testing.T) {
|
||||
// Mirrors delegation.go's builder which emits {"type":"text",...}. After
|
||||
// normalization the wire Part must be discriminated by "kind".
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"type":"text","text":"a"},{"type":"file","uri":"workspace:/x"}]}}}`)
|
||||
parts := msg["parts"].([]interface{})
|
||||
if len(parts) != 2 {
|
||||
t.Fatalf("expected 2 parts, got %d", len(parts))
|
||||
}
|
||||
wantKind := []string{"text", "file"}
|
||||
for i, p := range parts {
|
||||
part := p.(map[string]interface{})
|
||||
if part["kind"] != wantKind[i] {
|
||||
t.Errorf("part %d: expected kind=%q, got %v", i, wantKind[i], part["kind"])
|
||||
}
|
||||
if _, hasType := part["type"]; hasType {
|
||||
t.Errorf("part %d still carries legacy \"type\": %v", i, part)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeA2APayload_DoesNotClobberKindWithType(t *testing.T) {
|
||||
// If a part has BOTH kind and type, kind wins and is left untouched.
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"kind":"text","type":"ignored","text":"a"}]}}}`)
|
||||
part := msg["parts"].([]interface{})[0].(map[string]interface{})
|
||||
if part["kind"] != "text" {
|
||||
t.Errorf("expected kind preserved as \"text\", got %v", part["kind"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestNormalizeA2APayload_RoleDefault_ContractRegression documents the
|
||||
// pre-fix failure: without the role default, a role-less message/send body
|
||||
// emerged from normalization still missing params.message.role, which the
|
||||
// peer's a2a Pydantic validator rejects. This asserts the POST-fix invariant
|
||||
// (role present) directly; before the a2a_proxy.go change this assertion
|
||||
// fails (role is absent → msg["role"] == nil).
|
||||
func TestNormalizeA2APayload_RoleDefault_ContractRegression(t *testing.T) {
|
||||
msg := normMsg(t, `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"delegate this"}]}}}`)
|
||||
role, hasRole := msg["role"]
|
||||
if !hasRole {
|
||||
t.Fatal("REGRESSION (#2251): params.message.role absent after normalization — peer a2a validator will reject with 'role Field required'")
|
||||
}
|
||||
if role != "user" {
|
||||
t.Errorf("expected default role \"user\", got %v", role)
|
||||
}
|
||||
}
|
||||
|
||||
// --- resolveAgentURL direct unit tests ---
|
||||
|
||||
func TestResolveAgentURL_CacheHit(t *testing.T) {
|
||||
|
||||
@@ -179,8 +179,11 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
|
||||
"message": map[string]interface{}{
|
||||
"role": "user",
|
||||
"messageId": delegationID,
|
||||
"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
|
||||
"metadata": map[string]interface{}{"delegation_id": delegationID},
|
||||
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251) —
|
||||
// a `type`-keyed Part is dropped by the receiver's v0.3
|
||||
// validator, silently losing the delegated task.
|
||||
"parts": []map[string]interface{}{{"kind": "text", "text": body.Task}},
|
||||
"metadata": map[string]interface{}{"delegation_id": delegationID},
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
@@ -192,7 +192,11 @@ func (h *MCPHandler) toolGetWorkspaceInfo(ctx context.Context, workspaceID strin
|
||||
// follow in the order provided, with kind derived from MIME type.
|
||||
func buildA2AMessageParts(task string, attachments []AgentMessageAttachment) []map[string]interface{} {
|
||||
parts := []map[string]interface{}{
|
||||
{"type": "text", "text": task},
|
||||
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251).
|
||||
// The receiver's v0.3 Pydantic validator drops a Part keyed
|
||||
// `type`, silently losing the task text — the file part below
|
||||
// already uses `kind`, this is the matching fix for text.
|
||||
{"kind": "text", "text": task},
|
||||
}
|
||||
for _, att := range attachments {
|
||||
kind := kindFromMimeType(att.MimeType)
|
||||
|
||||
@@ -161,7 +161,7 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
|
||||
// 1. Strip plugin's rule/fragment markers from CLAUDE.md (mirrors
|
||||
// AgentskillsAdaptor.uninstall lines 184-188). Best-effort: if
|
||||
// the user edited CLAUDE.md, our marker stays untouched.
|
||||
h.stripPluginMarkersFromMemory(ctx, workspaceID, containerName, pluginName)
|
||||
h.stripPluginMarkersFromMemory(ctx, containerName, pluginName)
|
||||
|
||||
// 2. Remove copied skill dirs declared in the plugin's plugin.yaml.
|
||||
for _, skill := range skillNames {
|
||||
@@ -171,11 +171,9 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
|
||||
log.Printf("Plugin uninstall: skipping invalid skill name %q in %s: %v", skill, pluginName, err)
|
||||
continue
|
||||
}
|
||||
if _, rmErr := h.execAsRoot(ctx, containerName, []string{
|
||||
_, _ = h.execAsRoot(ctx, containerName, []string{
|
||||
"rm", "-rf", "/configs/skills/" + skill,
|
||||
}); rmErr != nil {
|
||||
log.Printf("Plugin uninstall: failed to remove skill %s from %s: %v", skill, workspaceID, rmErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// 3. Delete the plugin directory itself (as root to handle file ownership).
|
||||
|
||||
@@ -393,7 +393,7 @@ func (h *PluginsHandler) readPluginSkillsFromContainer(ctx context.Context, cont
|
||||
// `# Plugin: <name> /` — mirrors AgentskillsAdaptor.uninstall's stripping
|
||||
// logic so install/uninstall are symmetric. Best-effort: silent on read or
|
||||
// write failure, since the rest of uninstall must still succeed.
|
||||
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, workspaceID, containerName, pluginName string) {
|
||||
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, containerName, pluginName string) {
|
||||
// Use sed via bash -c for atomic in-place delete: drop the marker line
|
||||
// and the blank line that follows it (install adds a leading blank line
|
||||
// before the marker via append_to_memory). Three sed passes mirror the
|
||||
@@ -417,9 +417,7 @@ func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, works
|
||||
`awk 'BEGIN{skip=0; blanks=0} /^%s/{skip=1; blanks=0; next} skip==1 && /^[[:space:]]*$/{blanks++; if(blanks>=2){skip=0; print; next} next} /^# Plugin: /{if(skip==1)skip=0} skip==1{next} {print}' /configs/CLAUDE.md > /tmp/claude.new && mv /tmp/claude.new /configs/CLAUDE.md`,
|
||||
regexpEscapeForAwk(marker),
|
||||
)
|
||||
if _, awkErr := h.execAsRoot(ctx, containerName, []string{"bash", "-c", script}); awkErr != nil {
|
||||
log.Printf("Plugin uninstall: failed to strip markers from CLAUDE.md for %s in %s: %v", pluginName, workspaceID, awkErr)
|
||||
}
|
||||
_, _ = h.execAsRoot(ctx, containerName, []string{"bash", "-c", script})
|
||||
}
|
||||
|
||||
// regexpEscapeForAwk escapes characters that have special meaning inside an
|
||||
|
||||
@@ -0,0 +1,160 @@
|
||||
package models
|
||||
|
||||
// Contract test: the EXACT request bodies the workspace runtime emits for
|
||||
// POST /registry/register and POST /registry/heartbeat bind cleanly against
|
||||
// the real RegisterPayload / HeartbeatPayload structs — and a body missing a
|
||||
// binding:"required" field is REJECTED.
|
||||
//
|
||||
// Why this exists — the same blind-spot class as the #2251 A2A bug
|
||||
// ----------------------------------------------------------------
|
||||
// The existing registry_test.go binds HAND-WRITTEN JSON literals
|
||||
// (`{"id":"ws-123","agent_card":{...}}`) that encode the *test author's*
|
||||
// idea of the wire shape, not the bytes the runtime actually produces. The
|
||||
// runtime's producer (molecule-ai-workspace-runtime main.py:484 /
|
||||
// heartbeat.py:233) is a separate hand-rolled dict. Nothing pinned that the
|
||||
// two agree on the required keys.
|
||||
//
|
||||
// These golden bodies are byte-for-byte the shapes the runtime emits (see the
|
||||
// companion Python contract test test_registry_payload_contract.py, which
|
||||
// asserts the runtime PRODUCES exactly these required keys). Together the two
|
||||
// halves form a producer→consumer contract: if the runtime drops a required
|
||||
// key, the Python test fails; if this struct adds/renames a required field,
|
||||
// the Go test below fails — drift can't pass silently on either side.
|
||||
//
|
||||
// gin's ShouldBindJSON runs `binding.JSON.BindBody`, which is json.Unmarshal
|
||||
// followed by the go-playground validator on the `binding` tags. We invoke
|
||||
// that exact path here without standing up a gin.Context / DB / Redis.
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/gin-gonic/gin/binding"
|
||||
)
|
||||
|
||||
// bindJSON mirrors gin's ShouldBindJSON: decode + validate the `binding` tags.
|
||||
func bindJSON(t *testing.T, body []byte, out any) error {
|
||||
t.Helper()
|
||||
return binding.JSON.BindBody(body, out)
|
||||
}
|
||||
|
||||
// ---- /registry/register --------------------------------------------------
|
||||
|
||||
// The exact body main.py emits (workspace_id + workspace_url + the hand-rolled
|
||||
// agent_card_dict). agent_card is json.RawMessage on the struct so its inner
|
||||
// shape is opaque to the bind — only presence is required.
|
||||
const runtimeRegisterBody = `{
|
||||
"id": "11111111-1111-1111-1111-111111111111",
|
||||
"url": "https://ws.example/a2a",
|
||||
"agent_card": {
|
||||
"name": "pm",
|
||||
"description": "team lead",
|
||||
"version": "1.0.0",
|
||||
"url": "https://ws.example/a2a",
|
||||
"skills": [{"id": "coding", "name": "coding", "description": "coding", "tags": []}],
|
||||
"capabilities": {"streaming": true, "pushNotifications": false},
|
||||
"configuration_status": "ready"
|
||||
}
|
||||
}`
|
||||
|
||||
func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) {
|
||||
var p RegisterPayload
|
||||
if err := bindJSON(t, []byte(runtimeRegisterBody), &p); err != nil {
|
||||
t.Fatalf("runtime register body must bind against RegisterPayload, got: %v", err)
|
||||
}
|
||||
if p.ID != "11111111-1111-1111-1111-111111111111" {
|
||||
t.Errorf("id not decoded: %q", p.ID)
|
||||
}
|
||||
if len(p.AgentCard) == 0 {
|
||||
t.Error("agent_card must be present (binding:required)")
|
||||
}
|
||||
if p.URL == "" {
|
||||
t.Error("url should round-trip from the runtime body")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegisterPayload_MissingID_Rejected(t *testing.T) {
|
||||
// The #2251-style regression: runtime drops the required `id` key.
|
||||
const noID = `{"url":"https://ws.example/a2a","agent_card":{"name":"pm"}}`
|
||||
var p RegisterPayload
|
||||
if err := bindJSON(t, []byte(noID), &p); err == nil {
|
||||
t.Fatal("a register body missing the required `id` MUST be rejected (would 400); got nil error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegisterPayload_MissingAgentCard_Rejected(t *testing.T) {
|
||||
const noCard = `{"id":"ws-1","url":"https://ws.example/a2a"}`
|
||||
var p RegisterPayload
|
||||
if err := bindJSON(t, []byte(noCard), &p); err == nil {
|
||||
t.Fatal("a register body missing the required `agent_card` MUST be rejected (would 400); got nil error")
|
||||
}
|
||||
}
|
||||
|
||||
// ---- /registry/heartbeat -------------------------------------------------
|
||||
|
||||
// The exact body heartbeat.py:233 emits (no wedge/metadata, the healthy case).
|
||||
const runtimeHeartbeatBody = `{
|
||||
"workspace_id": "00000000-0000-0000-0000-000000000688",
|
||||
"error_rate": 0.0,
|
||||
"sample_error": "",
|
||||
"active_tasks": 0,
|
||||
"current_task": "",
|
||||
"uptime_seconds": 42
|
||||
}`
|
||||
|
||||
func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) {
|
||||
var p HeartbeatPayload
|
||||
if err := bindJSON(t, []byte(runtimeHeartbeatBody), &p); err != nil {
|
||||
t.Fatalf("runtime heartbeat body must bind against HeartbeatPayload, got: %v", err)
|
||||
}
|
||||
if p.WorkspaceID != "00000000-0000-0000-0000-000000000688" {
|
||||
t.Errorf("workspace_id not decoded: %q", p.WorkspaceID)
|
||||
}
|
||||
if p.UptimeSeconds != 42 {
|
||||
t.Errorf("uptime_seconds not decoded: %d", p.UptimeSeconds)
|
||||
}
|
||||
}
|
||||
|
||||
// The wedged-runtime heartbeat (heartbeat.py _runtime_state_payload +
|
||||
// _runtime_metadata_payload layered on) must also bind — runtime_metadata is a
|
||||
// pointer so a present block decodes, and an absent one stays nil.
|
||||
const runtimeHeartbeatWedgedBody = `{
|
||||
"workspace_id": "00000000-0000-0000-0000-000000000688",
|
||||
"error_rate": 0.5,
|
||||
"active_tasks": 1,
|
||||
"current_task": "stuck",
|
||||
"uptime_seconds": 99,
|
||||
"runtime_state": "wedged",
|
||||
"sample_error": "Control request timeout: initialize",
|
||||
"runtime_metadata": {
|
||||
"capabilities": {"heartbeat": true, "scheduler": false},
|
||||
"idle_timeout_seconds": 600
|
||||
}
|
||||
}`
|
||||
|
||||
func TestHeartbeatPayload_WedgedRuntimeBodyBinds(t *testing.T) {
|
||||
var p HeartbeatPayload
|
||||
if err := bindJSON(t, []byte(runtimeHeartbeatWedgedBody), &p); err != nil {
|
||||
t.Fatalf("wedged heartbeat body must bind, got: %v", err)
|
||||
}
|
||||
if p.RuntimeState != "wedged" {
|
||||
t.Errorf("runtime_state not decoded: %q", p.RuntimeState)
|
||||
}
|
||||
if p.RuntimeMetadata == nil {
|
||||
t.Fatal("runtime_metadata must decode to a non-nil pointer when present")
|
||||
}
|
||||
if got := p.RuntimeMetadata.Capabilities["heartbeat"]; !got {
|
||||
t.Error("runtime_metadata.capabilities[heartbeat] should be true")
|
||||
}
|
||||
if p.RuntimeMetadata.IdleTimeoutSeconds == nil || *p.RuntimeMetadata.IdleTimeoutSeconds != 600 {
|
||||
t.Error("runtime_metadata.idle_timeout_seconds should decode to 600")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayload_MissingWorkspaceID_Rejected(t *testing.T) {
|
||||
// The drift the producer-side Python test guards: workspace_id renamed/dropped.
|
||||
const renamed = `{"id":"ws-688","error_rate":0.0,"active_tasks":0}`
|
||||
var p HeartbeatPayload
|
||||
if err := bindJSON(t, []byte(renamed), &p); err == nil {
|
||||
t.Fatal("a heartbeat body missing the required `workspace_id` MUST be rejected (would 400); got nil error")
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,7 @@ const SchemaVersion = 1
|
||||
// Fingerprint is a stable content hash of the generated projection (schema
|
||||
// version + provider catalog + runtime native sets). It changes iff the
|
||||
// registry DATA changes (comment-only YAML edits do not churn it).
|
||||
const Fingerprint = "ec6b93409e7b9cf8"
|
||||
const Fingerprint = "e457249eb0fd77a2"
|
||||
|
||||
// GenProvider is the generated projection of one provider catalog entry —
|
||||
// the subset a downstream consumer needs to derive + display a provider.
|
||||
@@ -84,8 +84,8 @@ var Runtimes = map[string][]GenRuntimeRef{
|
||||
"claude-code": {
|
||||
{Name: "anthropic-oauth", Models: []string{"sonnet", "opus", "haiku", "anthropic:sonnet", "anthropic:opus", "anthropic:haiku"}},
|
||||
{Name: "anthropic-api", Models: []string{"claude-sonnet-4-6", "claude-opus-4-7", "claude-haiku-4-5", "claude-sonnet-4-5", "anthropic:claude-sonnet-4-6", "anthropic:claude-opus-4-7", "anthropic:claude-haiku-4-5", "anthropic:claude-sonnet-4-5"}},
|
||||
{Name: "kimi-coding", Models: []string{"kimi-for-coding", "kimi-k2.5", "kimi-k2", "moonshot:kimi-k2.6", "moonshot:kimi-k2.5"}},
|
||||
{Name: "minimax", Models: []string{"MiniMax-M2", "MiniMax-M2.7", "MiniMax-M2.7-highspeed", "MiniMax-M3", "minimax:MiniMax-M2", "minimax:MiniMax-M2.7", "minimax:MiniMax-M2.7-highspeed", "minimax:MiniMax-M3"}},
|
||||
{Name: "kimi-coding", Models: []string{"kimi-for-coding", "kimi-k2.5", "kimi-k2"}},
|
||||
{Name: "minimax", Models: []string{"MiniMax-M2", "MiniMax-M2.7", "MiniMax-M2.7-highspeed", "MiniMax-M3"}},
|
||||
{Name: "platform", Models: []string{"anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.6", "moonshot/kimi-k2.5", "minimax/MiniMax-M2.7", "minimax/MiniMax-M2.7-highspeed", "minimax/MiniMax-M3"}},
|
||||
{Name: "zai", Models: []string{}},
|
||||
{Name: "deepseek", Models: []string{}},
|
||||
|
||||
@@ -827,29 +827,25 @@ runtimes:
|
||||
- anthropic:claude-sonnet-4-5
|
||||
- name: kimi-coding
|
||||
# BYOK kimi-coding gateway ids — bare form is the canonical id
|
||||
# the gateway routes; the colon form `moonshot:kimi-k2.*` is the
|
||||
# legacy BYOK selection form (already in use on the openclaw
|
||||
# native set below). claude-code's adapter accepts both
|
||||
# (internal#718 P4 PR-1).
|
||||
# the gateway routes. The colon form `moonshot:kimi-k2.*` was
|
||||
# removed because claude-code's adapter cannot strip the
|
||||
# `moonshot:` prefix — it only handles `anthropic:`/`claude:`
|
||||
# (cp#521). The bare forms already cover these models.
|
||||
models:
|
||||
- kimi-for-coding
|
||||
- kimi-k2.5
|
||||
- kimi-k2
|
||||
- moonshot:kimi-k2.6
|
||||
- moonshot:kimi-k2.5
|
||||
- name: minimax
|
||||
# BYOK MiniMax ids — bare form is the canonical id; colon form is
|
||||
# the legacy BYOK selection spelling carried in the create corpus
|
||||
# and the openclaw template (internal#718 P4 PR-1).
|
||||
# BYOK MiniMax ids — bare form is the canonical id. The colon
|
||||
# forms `minimax:MiniMax-*` were removed because claude-code's
|
||||
# adapter cannot strip the `minimax:` prefix — it only handles
|
||||
# `anthropic:`/`claude:` (cp#521). The bare forms already cover
|
||||
# these models.
|
||||
models:
|
||||
- MiniMax-M2
|
||||
- MiniMax-M2.7
|
||||
- MiniMax-M2.7-highspeed
|
||||
- MiniMax-M3
|
||||
- minimax:MiniMax-M2
|
||||
- minimax:MiniMax-M2.7
|
||||
- minimax:MiniMax-M2.7-highspeed
|
||||
- minimax:MiniMax-M3
|
||||
# Platform-managed (no tenant key; Molecule owns billing). The
|
||||
# vendor/model-namespaced ids the proxy resolves to the upstream vendor.
|
||||
# Canonical for the template's `provider: platform` model entries — the
|
||||
|
||||
@@ -324,3 +324,46 @@ func TestVertexProviderRegistered(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestPlatformProvider_AuthEnvIsUsageTokenOnly is the SSOT-side regression
|
||||
// gate for the platform-managed auth_env drift class (issue #2250 — the
|
||||
// codex template's `platform` provider shipped
|
||||
// auth_env: [MOLECULE_LLM_USAGE_TOKEN, ANTHROPIC_API_KEY], wrongly
|
||||
// advertising a vendor key under a platform-managed provider).
|
||||
//
|
||||
// The `platform` provider is the closed Molecule proxy arm: the platform
|
||||
// owns billing and injects MOLECULE_LLM_USAGE_TOKEN, so a tenant supplies
|
||||
// NO vendor credential. Listing ANTHROPIC_API_KEY (or any other vendor key)
|
||||
// in its auth_env makes the canvas demand a credential the platform path
|
||||
// neither needs nor uses, and lets a stray vendor key satisfy the
|
||||
// "auth present" check on a path that ignores it — exactly the wrong-bill /
|
||||
// silent-no-op failure mode the BYOK-vs-platform split exists to prevent.
|
||||
//
|
||||
// EXACT-equality (not membership): the prior template-side test only
|
||||
// asserted `"MOLECULE_LLM_USAGE_TOKEN" in auth_env`, which PASSED against
|
||||
// the buggy two-element list. Pin the WHOLE set so an extra vendor key
|
||||
// trips the gate. This is the core providers.yaml SSOT; the template
|
||||
// derives from / must byte-match this set (drift-gated by molecule-ci).
|
||||
// On core this currently PASSES (auth_env is already clean; the vendor
|
||||
// key lives in the separate auth_token_env field) — the gate locks that
|
||||
// in so a future drift onto this SSOT trips CI.
|
||||
func TestPlatformProvider_AuthEnvIsUsageTokenOnly(t *testing.T) {
|
||||
ps, err := Load()
|
||||
if err != nil {
|
||||
t.Fatalf("Load() error = %v", err)
|
||||
}
|
||||
var platform *Provider
|
||||
for i := range ps {
|
||||
if ps[i].Name == "platform" {
|
||||
platform = &ps[i]
|
||||
break
|
||||
}
|
||||
}
|
||||
if platform == nil {
|
||||
t.Fatal("platform provider missing from providers.yaml — the closed proxy arm must exist")
|
||||
}
|
||||
want := []string{"MOLECULE_LLM_USAGE_TOKEN"}
|
||||
if len(platform.AuthEnv) != len(want) || platform.AuthEnv[0] != want[0] {
|
||||
t.Errorf("platform provider auth_env = %v, want exactly %v — a vendor key under a platform-managed provider is the #2250 drift; auth_token_env (the proxy's internal projection target) is a SEPARATE field and must not leak into auth_env", platform.AuthEnv, want)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,14 +117,15 @@ func TestModelsForRuntime_ExactModelIDs(t *testing.T) {
|
||||
"anthropic:claude-haiku-4-5", "anthropic:claude-sonnet-4-5",
|
||||
// anthropic via platform proxy (namespaced)
|
||||
"anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6",
|
||||
// kimi (kimi-coding gateway, bare + legacy colon-namespaced BYOK)
|
||||
// kimi (kimi-coding gateway, bare form only — colon-forms removed
|
||||
// because claude-code's adapter cannot strip the moonshot: prefix;
|
||||
// openclaw retains them natively, cp#521).
|
||||
"kimi-for-coding", "kimi-k2.5", "kimi-k2",
|
||||
"moonshot:kimi-k2.6", "moonshot:kimi-k2.5",
|
||||
// kimi via platform proxy
|
||||
"moonshot/kimi-k2.6", "moonshot/kimi-k2.5",
|
||||
// minimax BYOK (bare + legacy colon-namespaced)
|
||||
// minimax BYOK (bare form only — colon-forms removed because
|
||||
// claude-code's adapter cannot strip the minimax: prefix, cp#521).
|
||||
"MiniMax-M2", "MiniMax-M2.7", "MiniMax-M2.7-highspeed", "MiniMax-M3",
|
||||
"minimax:MiniMax-M2", "minimax:MiniMax-M2.7", "minimax:MiniMax-M2.7-highspeed", "minimax:MiniMax-M3",
|
||||
// minimax via platform proxy
|
||||
"minimax/MiniMax-M2.7", "minimax/MiniMax-M2.7-highspeed", "minimax/MiniMax-M3",
|
||||
},
|
||||
|
||||
@@ -29,7 +29,7 @@ import (
|
||||
// canonicalProvidersYAMLSHA256 is the sha256 of the canonical providers.yaml as
|
||||
// synced from molecule-controlplane. Bumped deliberately on each re-sync (see
|
||||
// file doc). Cross-checked live by the sync-providers-yaml CI workflow.
|
||||
const canonicalProvidersYAMLSHA256 = "846ddef11ec423ebf2e96b5da21bd89129dbc3f0a2d14ac086940e005c079387"
|
||||
const canonicalProvidersYAMLSHA256 = "9eb6f97fc37b528c91936be4a75dd87f6c7172742b4535d76b9bb2231ee18e80"
|
||||
|
||||
func TestSyncedYAMLMatchesCanonicalSHA(t *testing.T) {
|
||||
sum := sha256.Sum256(embeddedYAML)
|
||||
|
||||
@@ -36,11 +36,22 @@ package registry
|
||||
// runtime <> 'external'. Paused/hibernated/removed/provisioning/
|
||||
// awaiting_agent rows are out of scope; external rows are covered by
|
||||
// the remote-heartbeat pass.
|
||||
// - Per-cycle row cap + per-workspace timeout so one slow CP call can't
|
||||
// stall the sweep.
|
||||
// - Per-cycle row cap + per-cycle deadline + per-workspace timeout so
|
||||
// one slow CP call (or a degraded-but-not-erroring CP) can't stall
|
||||
// the sweep.
|
||||
// - TOCTOU re-confirm before any flip: IsRunning resolves instance_id
|
||||
// independently, so a row whose instance_id was cleared/NULLed (by a
|
||||
// concurrent delete, the CP-orphan-sweeper, or a reprovision) between
|
||||
// the reconciler's SELECT and the IsRunning probe yields a STALE
|
||||
// (false, nil) that does NOT prove the EC2 is dead. We re-read the
|
||||
// row's current (status, instance_id) and flip ONLY when the SAME
|
||||
// non-empty instance we asked CP about is still the workspace's
|
||||
// recorded instance AND it's still online/degraded. Mirrors the
|
||||
// guarded-write re-confirm in healthsweep.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
@@ -70,6 +81,20 @@ const CPInstanceReconcileLimit = 200
|
||||
// timeout context derived from the cycle context.
|
||||
const cpInstanceCheckTimeout = 10 * time.Second
|
||||
|
||||
// cpInstanceCycleDeadline bounds the wall-time of one whole reconcile
|
||||
// pass. With per-workspace 10s timeouts and a 200-row cap, a degraded-
|
||||
// but-not-erroring CP (each IsRunning slow but under the per-workspace
|
||||
// cap) could otherwise drag one cycle out for tens of minutes and starve
|
||||
// the next tick. Mirrors cp_orphan_sweeper's orphanSweepDeadline; chosen
|
||||
// under the 60s interval so a stuck cycle is abandoned before the next
|
||||
// one is due and the backlog drains across subsequent cycles.
|
||||
const cpInstanceCycleDeadline = 45 * time.Second
|
||||
|
||||
// cpInstanceReconfirmTimeout bounds the TOCTOU re-confirm read. This is a
|
||||
// single indexed primary-key lookup, so it should never be slow; a tight
|
||||
// timeout keeps the re-confirm from itself becoming a stall point.
|
||||
const cpInstanceReconfirmTimeout = 5 * time.Second
|
||||
|
||||
// StartCPInstanceReconciler runs the authoritative EC2-state reconcile
|
||||
// loop until ctx is cancelled. A nil checker makes the loop a no-op
|
||||
// (matches the nil-tolerant pattern of the sibling CP sweeper).
|
||||
@@ -106,21 +131,41 @@ func StartCPInstanceReconciler(ctx context.Context, checker InstanceRunningCheck
|
||||
}
|
||||
}
|
||||
|
||||
// reconcileRow pairs a workspace id with the instance_id captured in the
|
||||
// SAME SELECT, so the TOCTOU re-confirm can verify CP's (false, nil)
|
||||
// answer is about the instance the row still records — not one cleared
|
||||
// out from under us between the SELECT and the IsRunning probe.
|
||||
type reconcileRow struct {
|
||||
id string
|
||||
instanceID string
|
||||
}
|
||||
|
||||
// reconcileOnce executes one reconcile pass. Defensive against db.DB
|
||||
// being nil so a misconfigured boot doesn't panic.
|
||||
//
|
||||
// Scope: online + SaaS-EC2 workspaces only. runtime='external' rows are
|
||||
// excluded (covered by the remote-heartbeat pass); paused/hibernated/
|
||||
// removed/provisioning/awaiting_agent are excluded by the status filter.
|
||||
func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
|
||||
// Scope: online/degraded + SaaS-EC2 workspaces only. runtime='external'
|
||||
// rows are excluded (covered by the remote-heartbeat pass); paused/
|
||||
// hibernated/removed/provisioning/awaiting_agent are excluded by the
|
||||
// status filter. `degraded` is included because a SaaS workspace whose
|
||||
// heartbeat handler flipped it degraded then lost its EC2 falls through
|
||||
// every other sweep (matches healthsweep's `status IN ('online',
|
||||
// 'degraded')`).
|
||||
func reconcileOnce(parent context.Context, checker InstanceRunningChecker, onOffline OfflineHandler) {
|
||||
if db.DB == nil {
|
||||
return
|
||||
}
|
||||
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id::text
|
||||
// Per-cycle deadline so a degraded-but-not-erroring CP (each IsRunning
|
||||
// slow but under the per-workspace cap) can't drag one cycle out for
|
||||
// tens of minutes and starve the next tick. Per-workspace IsRunning
|
||||
// timeouts derive from this cycle context.
|
||||
cycleCtx, cancelCycle := context.WithTimeout(parent, cpInstanceCycleDeadline)
|
||||
defer cancelCycle()
|
||||
|
||||
rows, err := db.DB.QueryContext(cycleCtx, `
|
||||
SELECT id::text, instance_id
|
||||
FROM workspaces
|
||||
WHERE status = 'online'
|
||||
WHERE status IN ('online', 'degraded')
|
||||
AND instance_id IS NOT NULL
|
||||
AND instance_id != ''
|
||||
AND COALESCE(runtime, '') <> 'external'
|
||||
@@ -133,46 +178,130 @@ func reconcileOnce(ctx context.Context, checker InstanceRunningChecker, onOfflin
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var ids []string
|
||||
var candidates []reconcileRow
|
||||
for rows.Next() {
|
||||
var id string
|
||||
if scanErr := rows.Scan(&id); scanErr != nil {
|
||||
var r reconcileRow
|
||||
if scanErr := rows.Scan(&r.id, &r.instanceID); scanErr != nil {
|
||||
log.Printf("cp-instance-reconciler: row scan failed: %v", scanErr)
|
||||
continue
|
||||
}
|
||||
ids = append(ids, id)
|
||||
candidates = append(candidates, r)
|
||||
}
|
||||
if iterErr := rows.Err(); iterErr != nil {
|
||||
log.Printf("cp-instance-reconciler: rows iteration failed: %v", iterErr)
|
||||
return
|
||||
}
|
||||
|
||||
for _, id := range ids {
|
||||
processed, skipped := 0, 0
|
||||
for _, c := range candidates {
|
||||
// Abandon the cycle if we've blown the per-cycle deadline; the
|
||||
// next tick re-reads from the top (ORDER BY updated_at DESC) and
|
||||
// drains the backlog. Without this a slow CP could keep one cycle
|
||||
// running past its interval and never let a fresh one start.
|
||||
if cycleCtx.Err() != nil {
|
||||
log.Printf("cp-instance-reconciler: cycle deadline reached — processed %d, %d skipped (TOCTOU/changed), remaining deferred to next cycle", processed, skipped)
|
||||
return
|
||||
}
|
||||
processed++
|
||||
|
||||
// Per-workspace timeout so one slow CP round-trip can't stall
|
||||
// the whole sweep.
|
||||
checkCtx, cancel := context.WithTimeout(ctx, cpInstanceCheckTimeout)
|
||||
running, checkErr := checker.IsRunning(checkCtx, id)
|
||||
// the whole sweep. Derived from cycleCtx so the cycle deadline
|
||||
// always dominates.
|
||||
checkCtx, cancel := context.WithTimeout(cycleCtx, cpInstanceCheckTimeout)
|
||||
running, checkErr := checker.IsRunning(checkCtx, c.id)
|
||||
cancel()
|
||||
|
||||
if checkErr != nil {
|
||||
// FAIL-SAFE: transient DB/transport error (or a no-backend
|
||||
// signal). IsRunning returns (true, err) on these, so never
|
||||
// flip — leave the row online and retry next cycle.
|
||||
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", id, checkErr)
|
||||
log.Printf("cp-instance-reconciler: IsRunning(%s) errored, leaving online (fail-safe): %v", c.id, checkErr)
|
||||
continue
|
||||
}
|
||||
if running {
|
||||
continue
|
||||
}
|
||||
|
||||
// CLEAN "not running" — CP authoritatively reports the EC2 is
|
||||
// terminated/stopped/absent. Feed it into the existing offline +
|
||||
// (false, nil) is NOT yet proof the EC2 is dead. IsRunning
|
||||
// resolves instance_id independently (resolveInstanceID); if the
|
||||
// row's instance_id was cleared/NULLed (concurrent delete, the
|
||||
// CP-orphan-sweeper NULLing it, a reprovision) or the row moved
|
||||
// off online/degraded between our SELECT and this probe,
|
||||
// IsRunning returns a STALE (false, nil) that reflects a missing
|
||||
// instance_id, NOT a confirmed-terminated EC2. Re-confirm against
|
||||
// the row's CURRENT state and flip ONLY when the SAME non-empty
|
||||
// instance we asked CP about is still recorded AND the row is
|
||||
// still online/degraded. Mirrors healthsweep's guarded write.
|
||||
if !reconfirmStillOfflineCandidate(cycleCtx, c) {
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
|
||||
// CONFIRMED "not running" — CP authoritatively reports the EC2 is
|
||||
// terminated/stopped/absent AND the row still records that exact
|
||||
// instance as online/degraded. Feed it into the existing offline +
|
||||
// auto-heal machinery: onOffline flips the row offline and
|
||||
// triggers RestartByID, which reprovisions with the existing
|
||||
// volume.
|
||||
log.Printf("cp-instance-reconciler: workspace %s is status=online but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", id)
|
||||
log.Printf("cp-instance-reconciler: workspace %s (instance %s) is online/degraded but its EC2 is not running (terminated/stopped) — flipping offline + triggering reprovision", c.id, c.instanceID)
|
||||
if onOffline != nil {
|
||||
onOffline(ctx, id)
|
||||
onOffline(cycleCtx, c.id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reconfirmStillOfflineCandidate re-reads the workspace's CURRENT
|
||||
// (status, instance_id) and reports whether it is STILL a valid offline
|
||||
// candidate for the instance we just probed. It returns true ONLY when:
|
||||
//
|
||||
// - the row still exists, AND
|
||||
// - current status IN ('online','degraded'), AND
|
||||
// - current instance_id is non-empty, AND
|
||||
// - current instance_id == the instance_id captured in the original
|
||||
// SELECT (the one whose liveness CP just answered about).
|
||||
//
|
||||
// Any other outcome (row gone, status moved off online/degraded,
|
||||
// instance_id cleared or now points at a different instance) means the
|
||||
// IsRunning (false, nil) was a stale/cleared-instance snapshot rather
|
||||
// than a confirmed-terminated EC2 — return false so the caller skips the
|
||||
// flip. A DB error during re-confirm is treated as "not confirmed"
|
||||
// (false): fail-safe toward NOT flipping a workspace we can't re-verify.
|
||||
func reconfirmStillOfflineCandidate(parent context.Context, c reconcileRow) bool {
|
||||
if db.DB == nil {
|
||||
return false
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(parent, cpInstanceReconfirmTimeout)
|
||||
defer cancel()
|
||||
|
||||
var curStatus, curInstanceID string
|
||||
err := db.DB.QueryRowContext(ctx, `
|
||||
SELECT status, COALESCE(instance_id, '')
|
||||
FROM workspaces
|
||||
WHERE id = $1
|
||||
`, c.id).Scan(&curStatus, &curInstanceID)
|
||||
if err != nil {
|
||||
if err == sql.ErrNoRows {
|
||||
// Row deleted between SELECT and re-confirm — definitely not a
|
||||
// terminated-EC2 signal. Skip.
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: row gone — skipping flip (stale snapshot, not a dead EC2)", c.id)
|
||||
return false
|
||||
}
|
||||
// Transient DB error — fail-safe toward NOT flipping.
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s errored, skipping flip (fail-safe): %v", c.id, err)
|
||||
return false
|
||||
}
|
||||
|
||||
if curStatus != "online" && curStatus != "degraded" {
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: status moved to %q since SELECT — skipping flip", c.id, curStatus)
|
||||
return false
|
||||
}
|
||||
if curInstanceID == "" {
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id cleared since SELECT — skipping flip (CP answered about a now-detached instance)", c.id)
|
||||
return false
|
||||
}
|
||||
if curInstanceID != c.instanceID {
|
||||
log.Printf("cp-instance-reconciler: re-confirm %s: instance_id changed %s -> %s since SELECT (reprovision) — skipping flip", c.id, c.instanceID, curInstanceID)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -63,16 +63,48 @@ func (r *recordingOffline) got() []string {
|
||||
}
|
||||
|
||||
// expectReconcileQuery registers the reconciler's SELECT, pinning the
|
||||
// scope-critical predicates: status='online', instance_id present, and
|
||||
// runtime <> 'external'. A future widening that drops any of these (e.g.
|
||||
// sweeping paused rows, or external rows the heartbeat pass owns) fails
|
||||
// every test that uses this helper.
|
||||
// scope-critical predicates: status IN ('online','degraded'), instance_id
|
||||
// present (captured as a column for the TOCTOU re-confirm), and runtime
|
||||
// <> 'external'. A future widening that drops any of these (e.g. sweeping
|
||||
// paused rows, or external rows the heartbeat pass owns), or that drops
|
||||
// the instance_id column the re-confirm depends on, fails every test that
|
||||
// uses this helper.
|
||||
func expectReconcileQuery(mock sqlmock.Sqlmock, rows *sqlmock.Rows) {
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces\s+WHERE status = 'online'\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces\s+WHERE status IN \('online', 'degraded'\)\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+AND COALESCE\(runtime, ''\) <> 'external'\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
|
||||
WithArgs(CPInstanceReconcileLimit).
|
||||
WillReturnRows(rows)
|
||||
}
|
||||
|
||||
// reconcileRows builds the two-column (id, instance_id) result the
|
||||
// reconciler's SELECT now returns. Pass id/instance_id pairs.
|
||||
func reconcileRows(pairs ...[2]string) *sqlmock.Rows {
|
||||
r := sqlmock.NewRows([]string{"id", "instance_id"})
|
||||
for _, p := range pairs {
|
||||
r.AddRow(p[0], p[1])
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// expectReconfirm registers the TOCTOU re-confirm primary-key lookup for
|
||||
// workspace id `wsID`, returning the row's CURRENT (status, instance_id).
|
||||
// This is what the reconciler re-reads after IsRunning returns (false,
|
||||
// nil), before it flips: it only flips when the SAME non-empty instance
|
||||
// is still recorded AND status is still online/degraded.
|
||||
func expectReconfirm(mock sqlmock.Sqlmock, wsID, curStatus, curInstanceID string) {
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}).AddRow(curStatus, curInstanceID))
|
||||
}
|
||||
|
||||
// expectReconfirmNoRows registers a re-confirm lookup that finds the row
|
||||
// gone (deleted between SELECT and re-confirm) — the reconciler must
|
||||
// treat this as "not a dead EC2" and skip the flip.
|
||||
func expectReconfirmNoRows(mock sqlmock.Sqlmock, wsID string) {
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "instance_id"}))
|
||||
}
|
||||
|
||||
// TestReconcileOnce_NotRunning_FlipsOffline — the core bug (core#2247):
|
||||
// an online SaaS workspace whose EC2 is terminated. CP reports a CLEAN
|
||||
// (false, nil); onOffline MUST be called with that id so the existing
|
||||
@@ -82,7 +114,10 @@ func TestReconcileOnce_NotRunning_FlipsOffline(t *testing.T) {
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-dead": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-dead"))
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-dead", "i-dead"}))
|
||||
// (false,nil) → re-confirm: row still online with the SAME instance →
|
||||
// confirmed-dead → flip.
|
||||
expectReconfirm(mock, "ws-dead", "online", "i-dead")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -102,7 +137,8 @@ func TestReconcileOnce_Running_DoesNotFlip(t *testing.T) {
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-alive": true}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-alive"))
|
||||
// Running → no re-confirm, no flip.
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-alive", "i-alive"}))
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -126,7 +162,9 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
|
||||
}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).AddRow("ws-blip"))
|
||||
// (true,err) short-circuits BEFORE the re-confirm — no re-confirm query
|
||||
// is registered, so a stray re-confirm would fail ExpectationsWereMet.
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-blip", "i-blip"}))
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -143,19 +181,20 @@ func TestReconcileOnce_TransientError_DoesNotFlip(t *testing.T) {
|
||||
|
||||
// TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline — pins the
|
||||
// SELECT predicate. The regex in expectReconcileQuery requires
|
||||
// status='online' AND runtime <> 'external'; if a future edit widens the
|
||||
// scope to include paused/hibernated/removed rows or external rows (owned
|
||||
// by the heartbeat pass), this query no longer matches and sqlmock fails
|
||||
// the test. With the predicate intact, a DB that has only out-of-scope
|
||||
// rows returns empty → no IsRunning, no flip.
|
||||
// status IN ('online','degraded') AND runtime <> 'external'; if a future
|
||||
// edit widens the scope to include paused/hibernated/removed rows or
|
||||
// external rows (owned by the heartbeat pass), or narrows it back to drop
|
||||
// 'degraded', this query no longer matches and sqlmock fails the test.
|
||||
// With the predicate intact, a DB that has only out-of-scope rows returns
|
||||
// empty → no IsRunning, no flip.
|
||||
func TestReconcileOnce_QueryScopeExcludesExternalAndNonOnline(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{}
|
||||
off := &recordingOffline{}
|
||||
|
||||
// The predicate filters out external + non-online rows server-side,
|
||||
// modelled as the empty result those filters produce.
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}))
|
||||
// The predicate filters out external + out-of-scope-status rows
|
||||
// server-side, modelled as the empty result those filters produce.
|
||||
expectReconcileQuery(mock, reconcileRows())
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -180,10 +219,13 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
|
||||
}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, sqlmock.NewRows([]string{"id"}).
|
||||
AddRow("ws-dead").
|
||||
AddRow("ws-alive").
|
||||
AddRow("ws-blip"))
|
||||
expectReconcileQuery(mock, reconcileRows(
|
||||
[2]string{"ws-dead", "i-dead"},
|
||||
[2]string{"ws-alive", "i-alive"},
|
||||
[2]string{"ws-blip", "i-blip"},
|
||||
))
|
||||
// Only ws-dead reaches the re-confirm ((false,nil)); it confirms.
|
||||
expectReconfirm(mock, "ws-dead", "online", "i-dead")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
@@ -195,6 +237,147 @@ func TestReconcileOnce_MixedBatch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip — the HIGH-1
|
||||
// regression guard. IsRunning returns a CLEAN (false, nil), but between
|
||||
// the reconciler's SELECT and the probe the row's instance_id changed
|
||||
// (reprovision attached a fresh EC2). IsRunning's independent
|
||||
// resolveInstanceID is the reason the (false,nil) is stale: it may have
|
||||
// resolved an empty/old instance. The re-confirm sees a DIFFERENT
|
||||
// instance_id and MUST skip — flipping here would knock out a workspace
|
||||
// whose NEW EC2 is not proven dead and fire RestartByID on a just-
|
||||
// reprovisioned row.
|
||||
func TestReconcileOnce_TOCTOU_InstanceChanged_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-race": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-race", "i-old"}))
|
||||
// Re-confirm: row is still online but now points at a DIFFERENT
|
||||
// instance (reprovisioned) → the (false,nil) was about i-old which is
|
||||
// no longer attached → skip.
|
||||
expectReconfirm(mock, "ws-race", "online", "i-new")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: instance_id changed since SELECT must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip — same HIGH-1
|
||||
// guard, the instance_id-NULLed variant (CP-orphan-sweeper or a delete
|
||||
// cleared it). Re-confirm sees an empty instance_id → skip.
|
||||
func TestReconcileOnce_TOCTOU_InstanceCleared_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-cleared": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-cleared", "i-gone"}))
|
||||
expectReconfirm(mock, "ws-cleared", "online", "") // instance_id cleared
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: cleared instance_id must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip — same HIGH-1 guard,
|
||||
// the status-moved variant. The row left online/degraded (e.g. paused or
|
||||
// removed) between SELECT and re-confirm → skip.
|
||||
func TestReconcileOnce_TOCTOU_StatusMoved_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-paused": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-paused", "i-keep"}))
|
||||
expectReconfirm(mock, "ws-paused", "paused", "i-keep") // status moved out of scope
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: row no longer online/degraded must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip — same HIGH-1 guard, the
|
||||
// row-deleted variant. The re-confirm finds no row (concurrent delete) →
|
||||
// skip; a stale (false,nil) about a just-deleted row must never fire
|
||||
// onOffline/RestartByID.
|
||||
func TestReconcileOnce_TOCTOU_RowGone_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-deleted": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-deleted", "i-x"}))
|
||||
expectReconfirmNoRows(mock, "ws-deleted") // row gone
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("TOCTOU guard violated: deleted row must NOT flip, got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_Degraded_FlipsOffline — MED-3 scope. A `degraded`
|
||||
// SaaS workspace whose EC2 is gone is otherwise covered by NO sweep. It's
|
||||
// in scope (the SELECT regex requires status IN ('online','degraded')),
|
||||
// CP reports (false,nil), the re-confirm shows it STILL degraded with the
|
||||
// SAME instance → flip.
|
||||
func TestReconcileOnce_Degraded_FlipsOffline(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-degraded": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-degraded", "i-deg"}))
|
||||
expectReconfirm(mock, "ws-degraded", "degraded", "i-deg")
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 1 || got[0] != "ws-degraded" {
|
||||
t.Fatalf("expected onOffline(ws-degraded), got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconfirm_DBError_DoesNotFlip — re-confirm fail-safe. If the
|
||||
// re-confirm read itself errors (transient DB blip), we treat it as "not
|
||||
// confirmed" and skip the flip rather than acting on an unverifiable
|
||||
// (false,nil).
|
||||
func TestReconcileOnce_ReconfirmDBError_DoesNotFlip(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
checker := &fakeRunningChecker{running: map[string]bool{"ws-x": false}}
|
||||
off := &recordingOffline{}
|
||||
|
||||
expectReconcileQuery(mock, reconcileRows([2]string{"ws-x", "i-x"}))
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT status, COALESCE\(instance_id, ''\)\s+FROM workspaces\s+WHERE id = \$1`).
|
||||
WithArgs("ws-x").
|
||||
WillReturnError(errors.New("connection reset"))
|
||||
|
||||
reconcileOnce(context.Background(), checker, off.handler())
|
||||
|
||||
if got := off.got(); len(got) != 0 {
|
||||
t.Fatalf("re-confirm DB error must fail-safe (no flip), got %v", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileOnce_QueryError — DB transient failure. Reconcile returns
|
||||
// without panicking and never probes IsRunning or flips anything.
|
||||
func TestReconcileOnce_QueryError(t *testing.T) {
|
||||
@@ -202,7 +385,7 @@ func TestReconcileOnce_QueryError(t *testing.T) {
|
||||
checker := &fakeRunningChecker{}
|
||||
off := &recordingOffline{}
|
||||
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT id::text, instance_id\s+FROM workspaces`).
|
||||
WithArgs(CPInstanceReconcileLimit).
|
||||
WillReturnError(errors.New("connection refused"))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user