diff --git a/.github/workflows/e2e-staging-external.yml b/.github/workflows/e2e-staging-external.yml new file mode 100644 index 00000000..787c3169 --- /dev/null +++ b/.github/workflows/e2e-staging-external.yml @@ -0,0 +1,164 @@ +name: E2E Staging External Runtime + +# Regression for the four/five workspaces.status=awaiting_agent transitions +# that silently failed in production for five days before migration 046 +# extended the workspace_status enum (see +# workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql). +# +# Why this is its own workflow (not folded into e2e-staging-saas.yml): +# - The full-saas harness defaults to runtime=hermes, never exercises +# external-runtime. Adding an `external` parameter to that script +# would force every push to staging through both lifecycles in +# series, doubling the EC2 cold-start budget. +# - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER +# window, 90s default + sweep interval), which we wait through +# deliberately. Folding it into hermes would make the long path +# even longer. +# - It can run in parallel with the hermes E2E since both create +# fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs +# `e2e-...`). +# +# Triggers: +# - Push to staging when any source affecting external runtime, +# hibernation, or the migration set changes. +# - PR review for the same set. +# - Manual workflow_dispatch. +# - Daily cron at 07:30 UTC (catches drift on quiet days; staggered +# 30 min after e2e-staging-saas.yml's 07:00 UTC cron). +# +# Concurrency: serialized so two staging pushes don't fight for the +# same EC2 quota window. cancel-in-progress=false so a half-rolled +# tenant always finishes its teardown. + +on: + push: + branches: [staging, main] + paths: + - 'workspace-server/internal/handlers/workspace.go' + - 'workspace-server/internal/handlers/registry.go' + - 'workspace-server/internal/handlers/workspace_restart.go' + - 'workspace-server/internal/registry/healthsweep.go' + - 'workspace-server/internal/registry/liveness.go' + - 'workspace-server/migrations/**' + - 'workspace-server/internal/db/workspace_status_enum_drift_test.go' + - 'tests/e2e/test_staging_external_runtime.sh' + - '.github/workflows/e2e-staging-external.yml' + pull_request: + branches: [staging, main] + paths: + - 'workspace-server/internal/handlers/workspace.go' + - 'workspace-server/internal/handlers/registry.go' + - 'workspace-server/internal/handlers/workspace_restart.go' + - 'workspace-server/internal/registry/healthsweep.go' + - 'workspace-server/internal/registry/liveness.go' + - 'workspace-server/migrations/**' + - 'workspace-server/internal/db/workspace_status_enum_drift_test.go' + - 'tests/e2e/test_staging_external_runtime.sh' + - '.github/workflows/e2e-staging-external.yml' + workflow_dispatch: + inputs: + keep_org: + description: "Skip teardown for debugging (only via manual dispatch)" + required: false + type: boolean + default: false + stale_wait_secs: + description: "Seconds to wait for the heartbeat-staleness sweep (default 180 = 90s window + 90s buffer)" + required: false + default: "180" + schedule: + - cron: '30 7 * * *' + +concurrency: + group: e2e-staging-external + cancel-in-progress: false + +permissions: + contents: read + +jobs: + e2e-staging-external: + name: E2E Staging External Runtime + runs-on: ubuntu-latest + timeout-minutes: 25 + + env: + MOLECULE_CP_URL: https://staging-api.moleculesai.app + MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + E2E_RUN_ID: "${{ github.run_id }}-${{ 
github.run_attempt }}" + E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }} + E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }} + + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Verify admin token present + run: | + if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then + # Schedule + push triggers must hard-fail when the token is + # missing — silent skip would mask infra rot. Manual dispatch + # gets the same hard-fail; an operator running this on a fork + # without secrets configured needs to know up-front. + echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" + exit 2 + fi + echo "Admin token present ✓" + + - name: CP staging health preflight + run: | + code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health") + if [ "$code" != "200" ]; then + echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug." + exit 1 + fi + echo "Staging CP healthy ✓" + + - name: Run external-runtime E2E + id: e2e + run: bash tests/e2e/test_staging_external_runtime.sh + + # Mirror the e2e-staging-saas.yml safety net: if the runner is + # cancelled (e.g. concurrent staging push), the test script's + # EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to + # *this* run id. + - name: Teardown safety net (runs on cancel/failure) + if: always() + env: + ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} + run: | + set +e + orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \ + -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ + | python3 -c " + import json, sys, os, datetime + run_id = os.environ.get('GITHUB_RUN_ID', '') + d = json.load(sys.stdin) + # Scope STRICTLY to this run id (e2e-ext-YYYYMMDD--...) + # so concurrent runs and unrelated dev probes are not touched. + # Sweep today AND yesterday so a midnight-crossing run still + # cleans up its own slug. + today = datetime.date.today() + yesterday = today - datetime.timedelta(days=1) + dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d')) + if not run_id: + # Without a run id we cannot scope safely; bail rather + # than risk deleting unrelated tenants. + sys.exit(0) + prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates) + for o in d.get('orgs', []): + s = o.get('slug', '') + if s.startswith(prefixes) and o.get('status') != 'purged': + print(s) + " 2>/dev/null) + if [ -n "$orgs" ]; then + echo "Safety-net sweep: deleting leftover orgs:" + echo "$orgs" + for slug in $orgs; do + curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"confirm\":\"$slug\"}" >/dev/null 2>&1 + done + else + echo "Safety-net sweep: no leftover orgs to clean." + fi diff --git a/.github/workflows/harness-replays.yml b/.github/workflows/harness-replays.yml new file mode 100644 index 00000000..6330e885 --- /dev/null +++ b/.github/workflows/harness-replays.yml @@ -0,0 +1,167 @@ +name: Harness Replays + +# Boots tests/harness (production-shape compose topology with TenantGuard, +# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs +# every replay under tests/harness/replays/. Fails the PR if any replay +# fails. +# +# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as +# a public route in router.go but forgot to add it to TenantGuard's +# allowlist. 
The handler-level test in buildinfo_test.go constructed a +# minimal gin engine without TenantGuard — green. The harness's +# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't +# inject X-Molecule-Org-Id, so the curl path is identical to production's +# redeploy verifier), but no one ran the harness pre-merge. The bug +# shipped; the redeploy verifier silently soft-warned every tenant as +# "unreachable" for ~1 day before being noticed. +# +# This gate makes "did you actually run the harness?" a CI invariant +# instead of a memory-discipline thing. +# +# Trigger model — match e2e-api.yml: always FIRES on push/pull_request +# to staging+main, real work is gated per-step on detect-changes output. +# One job → one check run → branch-protection-clean (the SKIPPED-in-set +# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment). + +on: + push: + branches: [main, staging] + paths: + - 'workspace-server/**' + - 'canvas/**' + - 'tests/harness/**' + - '.github/workflows/harness-replays.yml' + pull_request: + branches: [main, staging] + paths: + - 'workspace-server/**' + - 'canvas/**' + - 'tests/harness/**' + - '.github/workflows/harness-replays.yml' + workflow_dispatch: + merge_group: + types: [checks_requested] + +concurrency: + # Per-SHA grouping. Per-ref kept hitting the auto-promote-staging + # cancellation deadlock — see e2e-api.yml's concurrency block for + # the 2026-04-28 incident that codified this pattern. + group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }} + cancel-in-progress: false + +jobs: + detect-changes: + runs-on: ubuntu-latest + outputs: + run: ${{ steps.decide.outputs.run }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 + id: filter + with: + filters: | + run: + - 'workspace-server/**' + - 'canvas/**' + - 'tests/harness/**' + - '.github/workflows/harness-replays.yml' + - id: decide + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "run=true" >> "$GITHUB_OUTPUT" + else + echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT" + fi + + # ONE job that always runs. Real work is gated per-step on + # detect-changes.outputs.run so an unrelated PR (e.g. doc-only + # change to molecule-controlplane wired here later) emits the + # required check without spending CI cycles. Single-job pattern + # matches e2e-api.yml — see that workflow's comment for why a + # job-level `if: false` would block branch protection via the + # SKIPPED-in-set bug. + harness-replays: + needs: detect-changes + name: Harness Replays + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: No-op pass (paths filter excluded this commit) + if: needs.detect-changes.outputs.run != 'true' + run: | + echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running." + echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)." + + - if: needs.detect-changes.outputs.run == 'true' + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Checkout sibling plugin repo + # Dockerfile.tenant copies molecule-ai-plugin-github-app-auth/ + # at the build-context root (see workspace-server/Dockerfile.tenant + # line 19). PLUGIN_REPO_PAT pattern matches publish-workspace-server-image.yml. 
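+        # Resulting build-context layout (a sketch — paths inferred from the two
+        # checkout steps in this job and the harness compose's `context: ../..`):
+        #   ./                                        repo checkout = context root
+        #   ./molecule-ai-plugin-github-app-auth/     sibling repo from this step
+        #   ./workspace-server/Dockerfile.tenant      copies the sibling dir during build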
+ if: needs.detect-changes.outputs.run == 'true' + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + repository: Molecule-AI/molecule-ai-plugin-github-app-auth + path: molecule-ai-plugin-github-app-auth + token: ${{ secrets.PLUGIN_REPO_PAT || secrets.GITHUB_TOKEN }} + + - name: Add /etc/hosts entry for harness-tenant.localhost + # ubuntu-latest doesn't auto-resolve *.localhost the way macOS + # sometimes does. seed.sh + replay scripts curl + # http://harness-tenant.localhost:8080 — without the entry + # they'd fail with getaddrinfo ENOTFOUND. + if: needs.detect-changes.outputs.run == 'true' + run: | + echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts >/dev/null + getent hosts harness-tenant.localhost + + - name: Install Python deps for replays + # peer-discovery-404 (and future replays) eval Python against the + # running tenant — importing workspace/a2a_client.py pulls in + # httpx. tests/harness/requirements.txt holds just the HTTP-client + # surface to keep CI install fast (~3s) vs the full + # workspace/requirements.txt (~30s). + if: needs.detect-changes.outputs.run == 'true' + run: pip install -r tests/harness/requirements.txt + + - name: Run all replays against the harness + # run-all-replays.sh: boot via up.sh → seed via seed.sh → run + # every replays/*.sh → tear down via down.sh on EXIT (trap). + # Non-zero exit on any replay failure. + # + # KEEP_UP=1: without this, the script's trap-on-EXIT tears + # down containers immediately on failure, leaving the dump + # step below with nothing to dump (verified on PR #2410's + # first run — tenant became unhealthy, trap fired, dump + # step saw empty containers). Keeping them up lets the + # failure path collect tenant/cp-stub/cf-proxy logs. The + # always-run "Force teardown" step does the actual cleanup. + if: needs.detect-changes.outputs.run == 'true' + working-directory: tests/harness + env: + KEEP_UP: "1" + run: ./run-all-replays.sh + + - name: Dump compose logs on failure + if: failure() && needs.detect-changes.outputs.run == 'true' + working-directory: tests/harness + run: | + echo "=== docker compose ps ===" + docker compose -f compose.yml ps || true + echo "=== tenant logs ===" + docker compose -f compose.yml logs tenant || true + echo "=== cp-stub logs ===" + docker compose -f compose.yml logs cp-stub || true + echo "=== cf-proxy logs ===" + docker compose -f compose.yml logs cf-proxy || true + echo "=== postgres logs (last 100) ===" + docker compose -f compose.yml logs --tail 100 postgres || true + + - name: Force teardown + # We pass KEEP_UP=1 to run-all-replays.sh so the dump step + # above sees real containers — that means we own teardown + # explicitly here. Always run. + if: always() && needs.detect-changes.outputs.run == 'true' + working-directory: tests/harness + run: ./down.sh || true diff --git a/.github/workflows/publish-runtime.yml b/.github/workflows/publish-runtime.yml index be59fe6c..5cd20a7a 100644 --- a/.github/workflows/publish-runtime.yml +++ b/.github/workflows/publish-runtime.yml @@ -154,139 +154,15 @@ jobs: - name: Verify package contents (sanity) working-directory: ${{ runner.temp }}/runtime-build + # Smoke logic lives in scripts/wheel_smoke.py so the same gate runs + # at both PR-time (runtime-prbuild-compat.yml) and publish-time + # (here). Splitting the smoke across two heredocs let them drift + # apart historically — one script keeps them locked. 
run: | python -m twine check dist/* - # Smoke-import the built wheel to catch import-rewrite mistakes - # before they hit PyPI. Asserts on STABLE INVARIANTS only — - # symbols + classes that are part of the package's public - # contract (BaseAdapter interface, the canonical a2a sentinel, - # core submodules). Don't add feature-flag-style assertions - # here — they fire false-positive every time staging is mid- - # release of that feature. python -m venv /tmp/smoke /tmp/smoke/bin/pip install --quiet dist/*.whl - WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \ - PLATFORM_URL=http://localhost:8080 \ - /tmp/smoke/bin/python -c " - # Importing main is the strongest smoke test we can do here: - # main.py is the entry point and pulls every other module - # transitively. If the build script missed an import rewrite - # (e.g. left a bare \`from transcript_auth import ...\` instead - # of \`from molecule_runtime.transcript_auth import ...\` — the - # 0.1.16 incident), this fails with ModuleNotFoundError instead - # of shipping to PyPI and breaking every workspace startup. - # Import the entry-point target by NAME — not just the module. - # The wheel's pyproject.toml declares - # `molecule-runtime = molecule_runtime.main:main_sync` so if - # main_sync goes missing (it did in 0.1.16-0.1.18), every - # workspace startup fails with `ImportError: cannot import name - # 'main_sync'`. Plain `import molecule_runtime.main` doesn't - # catch that because the module loads fine. - from molecule_runtime.main import main_sync # noqa: F401 - from molecule_runtime import a2a_client, a2a_tools - from molecule_runtime.builtin_tools import memory - from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig - # Stable invariants: package exports + BaseAdapter shape. - assert a2a_client._A2A_ERROR_PREFIX, 'a2a_client missing error sentinel' - assert callable(get_adapter), 'adapters.get_adapter must be callable' - assert hasattr(BaseAdapter, 'name'), 'BaseAdapter interface broken' - assert hasattr(AdapterConfig, '__init__'), 'AdapterConfig dataclass missing' - - # Call-shape smoke for AgentCard. Pure imports don't catch - # field-shape regressions in upstream SDKs that only surface - # at construction time. Two bugs of this exact class shipped - # since the a2a-sdk 1.0 migration: - # - state_transition_history=True (fixed in #2179) - # - supported_protocols=[...] (the protobuf field is - # supported_interfaces — caused every workspace boot - # to crash with `ValueError: Protocol message AgentCard - # has no "supported_protocols" field`; fixed alongside - # this smoke) - # - # This block instantiates the EXACT classes main.py uses, - # with the EXACT keyword arguments. If a future a2a-sdk - # upgrade renames any of supported_interfaces / streaming / - # push_notifications / etc., the publish fails here instead - # of breaking every workspace startup. main.py and this - # smoke MUST stay in lockstep — adding a kwarg to one - # without mirroring it here is the regression vector. 
- from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface - AgentCard( - name='smoke-agent', - description='publish-runtime smoke test', - version='0.0.0-smoke', - supported_interfaces=[ - AgentInterface(protocol_binding='https://a2a.g/v1', url='http://localhost:8080'), - ], - capabilities=AgentCapabilities( - streaming=True, - push_notifications=False, - ), - skills=[ - AgentSkill( - id='smoke-skill', - name='Smoke', - description='no-op', - tags=['smoke'], - examples=['noop'], - ), - ], - default_input_modes=['text/plain', 'application/json'], - default_output_modes=['text/plain', 'application/json'], - ) - print('✓ AgentCard call-shape smoke passed') - - # Well-known agent-card path probe alignment. main.py's - # _send_initial_prompt() polls AGENT_CARD_WELL_KNOWN_PATH - # to know when the local A2A server is ready. If the SDK - # ever splits the constant value from the path that - # create_agent_card_routes() actually mounts at, every - # workspace silently drops its initial_prompt: - # - Probe gets 404 every attempt. - # - Falls through to 'server not ready after 30s, - # skipping' even though the server is fine. - # - The user hits a fresh chat with no kickoff context. - # This was the #2193 incident class — the v0.x → v1.x - # rename of /.well-known/agent.json → /.well-known/agent-card.json - # plus the constant itself moving to a2a.utils.constants. - # source-tree pytest (test_agent_card_well_known_path.py) - # catches main.py-side regressions; this catches the - # SDK-side ones BEFORE PyPI upload. - from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH - from a2a.server.routes import create_agent_card_routes - mounted_paths = [ - getattr(r, 'path', None) - for r in create_agent_card_routes( - AgentCard( - name='wk-smoke', - description='well-known mount alignment', - version='0.0.0-smoke', - ) - ) - ] - assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, ( - f'AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) ' - f'is NOT among paths mounted by create_agent_card_routes ' - f'({mounted_paths!r}). The SDK constant and its own route ' - f'factory have drifted — workspace probes will 404 forever, ' - f'silently dropping every workspace initial_prompt.' - ) - print(f'✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})') - - # Message helper smoke. a2a-sdk renamed - # new_agent_text_message → new_text_message in the v1.x - # protobuf-flat migration (per the v0→v1 cheat sheet). main.py - # and a2a_executor.py call new_text_message in hot paths; if - # the import breaks, every reply errors with ImportError before - # the message even leaves the workspace. Importing here - # catches a future v2.x rename at publish time. - from a2a.helpers import new_text_message - msg = new_text_message('smoke') - assert msg is not None, 'new_text_message returned None' - print('✓ message helper import + call OK') - - print('✓ smoke import passed') - " + /tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py" - name: Publish to PyPI (Trusted Publisher / OIDC) # PyPI side is configured: project molecule-ai-workspace-runtime → diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index efacbe69..46743347 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -306,6 +306,17 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. 
Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: same logic as the staging + # variant — see that file's comment for the full rationale. + # Floor only applies when fleet >= 4; below that, canary-verify + # is the actual gate. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race." + exit 1 + fi + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1 diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 125f25c1..7f191e8d 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -283,6 +283,25 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: if MORE than half the fleet is + # unreachable AND the fleet is large enough that "half down" is + # statistically meaningful, this is a real outage (e.g. new image + # crashes on startup), not a teardown race. Hard-fail. + # + # Floor only applies when TOTAL_VERIFIED >= 4 — below that, the + # canary-verify step is the actual gate for "all tenants down" + # detection (it runs against the canary first and aborts the + # rollout if the canary fails to come up). Without the >=4 gate, + # a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a + # quiet staging push) would re-flake on the exact teardown-race + # condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race." + exit 1 + fi + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1 diff --git a/.github/workflows/runtime-prbuild-compat.yml b/.github/workflows/runtime-prbuild-compat.yml index aad6e929..96f1a289 100644 --- a/.github/workflows/runtime-prbuild-compat.yml +++ b/.github/workflows/runtime-prbuild-compat.yml @@ -34,12 +34,14 @@ on: # changes (it controls the wheel layout). - 'workspace/**' - 'scripts/build_runtime_package.py' + - 'scripts/wheel_smoke.py' - '.github/workflows/runtime-prbuild-compat.yml' pull_request: branches: [main, staging] paths: - 'workspace/**' - 'scripts/build_runtime_package.py' + - 'scripts/wheel_smoke.py' - '.github/workflows/runtime-prbuild-compat.yml' workflow_dispatch: # Required-check support: when this becomes a branch-protection gate, @@ -94,7 +96,9 @@ jobs: /tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \ | grep -E '^(Name|Version):' - name: Smoke import the PR-built wheel - env: - WORKSPACE_ID: 00000000-0000-0000-0000-000000000001 + # Same script publish-runtime.yml runs against the to-be-PyPI wheel. 
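+        # (Local reproduction, reusing the exact venv steps publish-runtime.yml
+        # already performs — nothing here is a new interface:
+        #   python -m venv /tmp/smoke && /tmp/smoke/bin/pip install dist/*.whl
+        #   /tmp/smoke/bin/python scripts/wheel_smoke.py )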
+ # Closes the PR-time vs publish-time gap: a PR adding a new SDK + # call-shape no longer passes here (narrow `import main_sync`) only + # to fail post-merge in publish-runtime's broader smoke. run: | - /tmp/venv-built/bin/python -c "from molecule_runtime.main import main_sync; print('PR-built runtime imports OK')" + /tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py" diff --git a/canvas/src/app/api/buildinfo/__tests__/route.test.ts b/canvas/src/app/api/buildinfo/__tests__/route.test.ts new file mode 100644 index 00000000..ac2d8f7b --- /dev/null +++ b/canvas/src/app/api/buildinfo/__tests__/route.test.ts @@ -0,0 +1,48 @@ +/** + * Canvas /api/buildinfo — version-display endpoint mirroring + * workspace-server's /buildinfo. Lets `curl /api/buildinfo` + * confirm which git SHA is live on a canvas deployment. + */ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import { GET } from "../route"; + +const ENV_KEYS = ["VERCEL_GIT_COMMIT_SHA", "VERCEL_GIT_COMMIT_REF", "VERCEL_ENV"]; + +describe("GET /api/buildinfo", () => { + let saved: Record; + + beforeEach(() => { + saved = Object.fromEntries(ENV_KEYS.map((k) => [k, process.env[k]])); + for (const k of ENV_KEYS) delete process.env[k]; + }); + + afterEach(() => { + for (const k of ENV_KEYS) { + if (saved[k] === undefined) delete process.env[k]; + else process.env[k] = saved[k]; + } + }); + + it("returns dev sentinel when Vercel env vars are unset", async () => { + const res = await GET(); + const body = await res.json(); + expect(body).toEqual({ git_sha: "dev", git_ref: "", vercel_env: "local" }); + }); + + it("reports the SHA Vercel injected at build time", async () => { + process.env.VERCEL_GIT_COMMIT_SHA = "abc1234567890"; + process.env.VERCEL_GIT_COMMIT_REF = "main"; + process.env.VERCEL_ENV = "production"; + const res = await GET(); + const body = await res.json(); + expect(body.git_sha).toBe("abc1234567890"); + expect(body.git_ref).toBe("main"); + expect(body.vercel_env).toBe("production"); + }); + + it("returns 200 status and JSON content type", async () => { + const res = await GET(); + expect(res.status).toBe(200); + expect(res.headers.get("content-type")).toContain("application/json"); + }); +}); diff --git a/canvas/src/app/api/buildinfo/route.ts b/canvas/src/app/api/buildinfo/route.ts new file mode 100644 index 00000000..a8ff8aab --- /dev/null +++ b/canvas/src/app/api/buildinfo/route.ts @@ -0,0 +1,18 @@ +import { NextResponse } from "next/server"; + +// Mirror of workspace-server's GET /buildinfo (PR #2398). Lets a developer +// confirm which git SHA is live on a canvas deployment with the same +// `curl /buildinfo` flow they use against tenant workspaces. +// +// Vercel injects VERCEL_GIT_COMMIT_SHA / _REF / VERCEL_ENV at build time +// from the deploying commit; outside Vercel (local `next dev`, harness) +// these are unset and the endpoint reports `git_sha: "dev"`. Same sentinel +// the workspace-server uses pre-ldflags-injection so both surfaces speak +// the same vocabulary. +export async function GET() { + return NextResponse.json({ + git_sha: process.env.VERCEL_GIT_COMMIT_SHA ?? "dev", + git_ref: process.env.VERCEL_GIT_COMMIT_REF ?? "", + vercel_env: process.env.VERCEL_ENV ?? "local", + }); +} diff --git a/scripts/ops/check-prod-versions.sh b/scripts/ops/check-prod-versions.sh new file mode 100755 index 00000000..88c721e7 --- /dev/null +++ b/scripts/ops/check-prod-versions.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +# Check whether production tenants and canvas are running latest main. 
+# +# Usage: +# ./scripts/ops/check-prod-versions.sh # production +# ENV=staging ./scripts/ops/check-prod-versions.sh # staging tenants +# +# Outputs a table of {surface, current_sha, expected_sha, status}. Returns +# non-zero if any surface is stale so this can be wired into a periodic +# alert. +# +# Why this exists: every time someone hits a "is the fix live?" question, +# they have to remember the curl pattern + cross-reference with +# `git rev-parse origin/main`. This script does that check uniformly across +# every public surface (workspace tenants + canvas) and gives a one-line +# verdict instead of a stack of one-off curls. + +set -euo pipefail + +ENV="${ENV:-production}" +EXPECTED_REF="${EXPECTED_REF:-main}" + +case "$ENV" in + production) + TENANT_DOMAIN="moleculesai.app" + CANVAS_URL="https://canvas.moleculesai.app" + # Default canary tenant for production. Override via TENANT_SLUGS= + # to cover a custom set. + DEFAULT_TENANTS="hongmingwang reno-stars" + ;; + staging) + TENANT_DOMAIN="staging.moleculesai.app" + CANVAS_URL="https://canvas-staging.moleculesai.app" + DEFAULT_TENANTS="" # staging tenants are ephemeral; user must specify + ;; + *) + echo "Unknown ENV=$ENV (expected: production | staging)" >&2 + exit 2 + ;; +esac + +TENANT_SLUGS="${TENANT_SLUGS:-$DEFAULT_TENANTS}" + +# Pull EXPECTED_SHA from GitHub. Falls back to local git if gh isn't +# logged in — local main may lag origin but is usually close enough for +# debugging, and we still report the comparison clearly. +EXPECTED_SHA="" +if command -v gh >/dev/null 2>&1; then + EXPECTED_SHA=$(gh api "repos/Molecule-AI/molecule-core/commits/${EXPECTED_REF}" --jq '.sha' 2>/dev/null || true) +fi +if [ -z "$EXPECTED_SHA" ]; then + if git rev-parse "origin/${EXPECTED_REF}" >/dev/null 2>&1; then + EXPECTED_SHA=$(git rev-parse "origin/${EXPECTED_REF}") + echo "[check-prod-versions] WARN: gh unavailable, using local origin/${EXPECTED_REF}=${EXPECTED_SHA:0:7} (may lag)" + else + echo "[check-prod-versions] ERROR: cannot resolve expected SHA — gh not logged in and origin/${EXPECTED_REF} not fetched" >&2 + exit 2 + fi +fi +EXPECTED_SHORT="${EXPECTED_SHA:0:7}" + +echo "Checking ${ENV} surfaces against ${EXPECTED_REF}=${EXPECTED_SHORT}" +echo "" +printf "%-25s %-9s %-9s %s\n" "Surface" "Live" "Expected" "Status" +printf "%-25s %-9s %-9s %s\n" "-------" "----" "--------" "------" + +STALE_COUNT=0 +UNREACHABLE_COUNT=0 + +# Tenant surfaces — workspace-server /buildinfo (added in PR #2398). +for slug in $TENANT_SLUGS; do + URL="https://${slug}.${TENANT_DOMAIN}/buildinfo" + BODY=$(curl -sS --max-time 15 "$URL" 2>/dev/null || echo "") + ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "") + if [ -z "$ACTUAL_SHA" ]; then + printf "%-25s %-9s %-9s ⚠ unreachable\n" "tenant: $slug" "—" "$EXPECTED_SHORT" + UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1)) + elif [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then + printf "%-25s %-9s %-9s ✓ current\n" "tenant: $slug" "${ACTUAL_SHA:0:7}" "$EXPECTED_SHORT" + else + printf "%-25s %-9s %-9s ✗ stale\n" "tenant: $slug" "${ACTUAL_SHA:0:7}" "$EXPECTED_SHORT" + STALE_COUNT=$((STALE_COUNT + 1)) + fi +done + +# Canvas — Next.js /api/buildinfo (PR #2407). Vercel injects +# VERCEL_GIT_COMMIT_SHA at build time so this reflects the deployed +# commit, not the request time. 
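+# A current deployment answers with the shape canvas/src/app/api/buildinfo/route.ts
+# (added in this change) returns:
+#   {"git_sha":"<full deploy sha>","git_ref":"main","vercel_env":"production"}
+# A non-Vercel build answers git_sha="dev", which the branch below counts as
+# unreachable instead of comparing it against $EXPECTED_SHA.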
+CANVAS_BODY=$(curl -sS --max-time 15 "${CANVAS_URL}/api/buildinfo" 2>/dev/null || echo "") +CANVAS_SHA=$(echo "$CANVAS_BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "") +if [ -z "$CANVAS_SHA" ]; then + printf "%-25s %-9s %-9s ⚠ unreachable (route may not be deployed yet)\n" "canvas" "—" "$EXPECTED_SHORT" + UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1)) +elif [ "$CANVAS_SHA" = "dev" ]; then + printf "%-25s %-9s %-9s ⚠ dev sentinel (Vercel env not injected — check VERCEL_GIT_COMMIT_SHA)\n" "canvas" "dev" "$EXPECTED_SHORT" + UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1)) +elif [ "$CANVAS_SHA" = "$EXPECTED_SHA" ]; then + printf "%-25s %-9s %-9s ✓ current\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT" +else + printf "%-25s %-9s %-9s ✗ stale\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT" + STALE_COUNT=$((STALE_COUNT + 1)) +fi + +echo "" +if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then + echo "All surfaces current." + exit 0 +fi +echo "Summary: ${STALE_COUNT} stale, ${UNREACHABLE_COUNT} unreachable." +# Stale is a deploy gap; unreachable is operational (DNS, CF, route absent). +# Both are signal — exit non-zero so cron / CI can alert. +exit 1 diff --git a/scripts/wheel_smoke.py b/scripts/wheel_smoke.py new file mode 100644 index 00000000..32db3350 --- /dev/null +++ b/scripts/wheel_smoke.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +"""Smoke-test an installed molecule-ai-workspace-runtime wheel. + +Runs the same invariant assertions in two workflows: + * publish-runtime.yml — after building dist/*.whl, before PyPI upload + * runtime-prbuild-compat.yml — after building the PR's wheel, before merge + +Splitting the smoke across two inline heredocs let PR-time and publish-time +drift apart. After 2026-04 we kept hitting publish-time failures for +regressions a PR-time check could have caught. One script, both gates. + +Failure here intentionally exits non-zero so the workflow's `run:` step fails. +Each block prints a single ✓ line on success so the GH summary log stays +readable; assertion errors propagate with their own message. + +Run directly: `python scripts/wheel_smoke.py` after `pip install `. +""" + +import os +import sys + + +def smoke_imports_and_invariants() -> None: + """Module imports + stable contract assertions. + + Importing main_sync by name is the strongest pre-PyPI gate we have for + import-rewrite mistakes (the 0.1.16 incident, where main.py loaded but + main_sync was missing because the build script dropped a re-export). + """ + from molecule_runtime.main import main_sync # noqa: F401 + from molecule_runtime import a2a_client, a2a_tools # noqa: F401 + from molecule_runtime.builtin_tools import memory # noqa: F401 + from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig + + assert a2a_client._A2A_ERROR_PREFIX, "a2a_client missing error sentinel" + assert callable(get_adapter), "adapters.get_adapter must be callable" + assert hasattr(BaseAdapter, "name"), "BaseAdapter interface broken" + assert hasattr(AdapterConfig, "__init__"), "AdapterConfig dataclass missing" + print("✓ module imports + invariants OK") + + +def smoke_agent_card_call_shape() -> None: + """Construct AgentCard with the EXACT kwargs main.py uses. + + Pure imports don't catch field-shape regressions in upstream SDKs that + only surface at construction time. Two bugs of this exact class shipped + since the a2a-sdk 1.0 migration: + - state_transition_history=True (#2179) + - supported_protocols=[...] 
(the protobuf field is supported_interfaces; + every workspace boot crashed with `ValueError: Protocol message + AgentCard has no "supported_protocols" field`) + + main.py and this block MUST stay in lockstep — adding a kwarg there + without mirroring it here is the regression vector. + """ + from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface + + AgentCard( + name="smoke-agent", + description="wheel-smoke: AgentCard call-shape", + version="0.0.0-smoke", + supported_interfaces=[ + AgentInterface(protocol_binding="https://a2a.g/v1", url="http://localhost:8080"), + ], + capabilities=AgentCapabilities( + streaming=True, + push_notifications=False, + ), + skills=[ + AgentSkill( + id="smoke-skill", + name="Smoke", + description="no-op", + tags=["smoke"], + examples=["noop"], + ), + ], + default_input_modes=["text/plain", "application/json"], + default_output_modes=["text/plain", "application/json"], + ) + print("✓ AgentCard call-shape smoke passed") + + +def smoke_well_known_path_alignment() -> None: + """The SDK's published constant must match the path it actually mounts. + + main.py polls AGENT_CARD_WELL_KNOWN_PATH to detect server readiness. If + the constant and create_agent_card_routes() drift, every workspace's + initial_prompt silently drops (probe 404s, falls through to "skipping"). + This was the #2193 incident class. + """ + from a2a.types import AgentCard + from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH + from a2a.server.routes import create_agent_card_routes + + mounted_paths = [ + getattr(r, "path", None) + for r in create_agent_card_routes( + AgentCard( + name="wk-smoke", + description="well-known mount alignment", + version="0.0.0-smoke", + ) + ) + ] + assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, ( + f"AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) is NOT among " + f"paths mounted by create_agent_card_routes ({mounted_paths!r}). The SDK " + "constant and its own route factory have drifted — workspace probes will " + "404 forever, silently dropping every workspace initial_prompt." + ) + print(f"✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})") + + +def smoke_message_helper() -> None: + """new_text_message is the v1.x rename of new_agent_text_message. + + main.py and a2a_executor.py call new_text_message in hot paths; if the + import breaks, every reply errors with ImportError before the message + even leaves the workspace. Importing here catches a future v2.x rename + at publish time. + """ + from a2a.helpers import new_text_message + + msg = new_text_message("smoke") + assert msg is not None, "new_text_message returned None" + print("✓ message helper import + call OK") + + +def main() -> int: + # main.py validates WORKSPACE_ID at module-import time via platform_auth. + # Set placeholders so the smoke doesn't trip on the env-var guard. 
+ os.environ.setdefault("WORKSPACE_ID", "00000000-0000-0000-0000-000000000000") + os.environ.setdefault("PLATFORM_URL", "http://localhost:8080") + + smoke_imports_and_invariants() + smoke_agent_card_call_shape() + smoke_well_known_path_alignment() + smoke_message_helper() + print("✓ wheel smoke passed") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/e2e/test_staging_external_runtime.sh b/tests/e2e/test_staging_external_runtime.sh new file mode 100755 index 00000000..68ca1b62 --- /dev/null +++ b/tests/e2e/test_staging_external_runtime.sh @@ -0,0 +1,348 @@ +#!/bin/bash +# test_staging_external_runtime.sh — E2E regression for the +# external-runtime workspace lifecycle on a real staging tenant. +# +# Why this test exists: the four/five sites that write 'awaiting_agent' +# / 'hibernating' to workspaces.status had been silently failing in +# production for five days (see migration 046) before a static drift +# gate caught the enum gap. Unit tests passed because sqlmock matched +# the SQL by regex but didn't enforce the live enum constraint, and +# every existing E2E exercised hermes (not external) so the silent +# failures never surfaced. This test pins the four awaiting_agent +# transitions in real Postgres on a real staging tenant. +# +# Verification path: +# 1. Provision a fresh tenant (test_staging_full_saas.sh harness shape). +# 2. Create an external-runtime workspace with NO URL → assert +# response status == 'awaiting_agent' AND GET on the workspace +# returns the same. (Pre-fix the row stuck on 'provisioning' +# because the UPDATE in workspace.go:333 silently failed.) +# 3. Register a fake URL via /registry/register → assert transition +# to 'online'. (Pre-fix this branch worked because it writes +# 'online' which IS in the enum.) +# 4. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER (90s +# default) + a sweep interval → assert transition back to +# 'awaiting_agent'. (Pre-fix the sweep UPDATE failed silently and +# the workspace stuck on 'online' indefinitely.) +# +# Hibernation is intentionally NOT covered here — it has its own timing +# model (idle threshold) and warrants a separate harness. +# +# Required env (mirrors test_staging_full_saas.sh): +# MOLECULE_CP_URL default: https://staging-api.moleculesai.app +# MOLECULE_ADMIN_TOKEN CP admin bearer (Railway CP_ADMIN_API_TOKEN) +# +# Optional env: +# E2E_PROVISION_TIMEOUT_SECS default 900 (15 min cold EC2 budget) +# E2E_KEEP_ORG 1 → skip teardown (debugging only) +# E2E_RUN_ID Slug suffix; CI: ${GITHUB_RUN_ID} +# E2E_STALE_WAIT_SECS default 180 (90s window + 90s buffer) +# E2E_INTENTIONAL_FAILURE 1 → break a step on purpose to verify +# the EXIT trap still tears down (mirrors +# the full-saas harness's safety net). +# +# Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout, +# 4 teardown leak. 
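+#
+# Example local run (a sketch — values are placeholders; the admin token is the
+# same Railway secret the workflow injects):
+#   MOLECULE_ADMIN_TOKEN=... E2E_KEEP_ORG=1 E2E_STALE_WAIT_SECS=180 \
+#     bash tests/e2e/test_staging_external_runtime.sh
+# E2E_KEEP_ORG=1 skips the EXIT-trap teardown, so delete the leftover e2e-ext-*
+# org by hand afterwards.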
+ +set -euo pipefail + +CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}" +ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}" +PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}" +RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}" +STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}" + +SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}" +SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32) + +log() { echo "[$(date +%H:%M:%S)] $*"; } +fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; } +ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; } + +CURL_COMMON=(-sS --fail-with-body --max-time 30) + +# ─── cleanup trap (mirrors full-saas) ──────────────────────────────────── +CLEANUP_DONE=0 +cleanup_org() { + local entry_rc=$? + if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi + CLEANUP_DONE=1 + + if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then + log "E2E_KEEP_ORG=1 → leaving $SLUG behind for inspection" + return 0 + fi + + log "Cleanup: deleting tenant $SLUG..." + curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \ + && ok "Teardown request accepted" \ + || log "Teardown returned non-2xx (may already be gone)" + + local leak_count=1 elapsed=0 + while [ "$elapsed" -lt 60 ]; do + leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \ + -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ + | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \ + 2>/dev/null || echo 1) + [ "$leak_count" = "0" ] && break + sleep 5 + elapsed=$((elapsed + 5)) + done + + if [ "$leak_count" != "0" ]; then + echo "⚠️ LEAK: org $SLUG still present post-teardown (count=$leak_count)" >&2 + exit 4 + fi + ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)" + + case "$entry_rc" in + 0|1|2|3|4) ;; + *) exit 1 ;; + esac +} +trap cleanup_org EXIT INT TERM + +# ─── 0. Preflight ─────────────────────────────────────────────────────── +log "═══════════════════════════════════════════════════════════════════" +log " Staging external-runtime E2E (regression for migration 046)" +log " CP: $CP_URL" +log " Slug: $SLUG" +log " Stale: ${STALE_WAIT_SECS}s wait window" +log "═══════════════════════════════════════════════════════════════════" + +curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed" +ok "CP reachable" + +admin_call() { + local method="$1"; shift; local path="$1"; shift + curl "${CURL_COMMON[@]}" -X "$method" "$CP_URL$path" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" "$@" +} + +# ─── 1. Create org ────────────────────────────────────────────────────── +log "1/8 Creating org $SLUG..." +CREATE_RESP=$(admin_call POST /cp/admin/orgs \ + -d "{\"slug\":\"$SLUG\",\"name\":\"E2E ext $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}") +ORG_ID=$(echo "$CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))") +[ -z "$ORG_ID" ] && fail "Org create response missing 'id'" +ok "Org created (id=$ORG_ID)" + +# ─── 2. Wait for tenant provisioning ──────────────────────────────────── +# Terminal status from /cp/admin/orgs is 'running' (org_instances.status), +# NOT 'ready' — same field the full-saas harness polls. 'failed' surfaces +# diagnostic dump and aborts. 
See test_staging_full_saas.sh step 2 for +# the field-bugfix history (2026-04-21, last_error path). +log "2/8 Waiting for tenant (up to ${PROVISION_TIMEOUT_SECS}s)..." +DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS )) +LAST_STATUS="" +while true; do + if [ "$(date +%s)" -gt "$DEADLINE" ]; then + fail "Tenant provisioning timed out (last: $LAST_STATUS)" + fi + LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}') + STATUS=$(echo "$LIST_JSON" | python3 -c " +import json, sys +d = json.load(sys.stdin) +for o in d.get('orgs', []): + if o.get('slug') == '$SLUG': + print(o.get('instance_status', '')) + sys.exit(0) +print('') +" 2>/dev/null || echo "") + if [ "$STATUS" != "$LAST_STATUS" ]; then + log " instance_status: $STATUS" + LAST_STATUS="$STATUS" + fi + case "$STATUS" in + running) break ;; + failed) + log "── DIAGNOSTIC BURST (step 2 — tenant provisioning failed) ──" + echo "$LIST_JSON" | python3 -c " +import json, sys +d = json.load(sys.stdin) +for o in d.get('orgs', []): + if o.get('slug') == '$SLUG': + print(json.dumps(o, indent=2)) + sys.exit(0) +print('(no org row found for slug=$SLUG — DB drift?)') +" 2>&1 | sed 's/^/ /' + log "── END DIAGNOSTIC ──" + fail "Tenant provisioning failed for $SLUG (see diagnostic above)" + ;; + *) sleep 15 ;; + esac +done +ok "Tenant provisioning complete" + +# Derive tenant URL the same way the full-saas harness does. +CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##') +case "$CP_HOST" in + api.*) DERIVED_DOMAIN="${CP_HOST#api.}" ;; + staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;; + *) DERIVED_DOMAIN="$CP_HOST" ;; +esac +TENANT_DOMAIN="${MOLECULE_TENANT_DOMAIN:-$DERIVED_DOMAIN}" +TENANT_URL="https://$SLUG.$TENANT_DOMAIN" +log " TENANT_URL=$TENANT_URL" + +# ─── 3. Per-tenant admin token + TLS readiness ────────────────────────── +log "3/8 Fetching per-tenant admin token..." +TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token") +TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))") +[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token" +ok "Token retrieved (len=${#TENANT_TOKEN})" + +log "Waiting for tenant TLS / DNS..." +TLS_DEADLINE=$(( $(date +%s) + 15 * 60 )) +while true; do + if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then break; fi + if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then + fail "Tenant URL never responded 2xx on /health within 15min" + fi + sleep 5 +done +ok "Tenant reachable" + +tenant_call() { + local method="$1"; shift; local path="$1"; shift + curl "${CURL_COMMON[@]}" -X "$method" "$TENANT_URL$path" \ + -H "Authorization: Bearer $TENANT_TOKEN" \ + -H "X-Molecule-Org-Id: $ORG_ID" \ + "$@" +} + +# ─── 4. Create external workspace (no URL) ────────────────────────────── +# This is the FIRST silent-failure path (workspace.go:333). Pre-migration +# 046, the response would say status=awaiting_agent but the row stuck +# on whatever the create handler set first (typically 'provisioning') +# because the follow-up UPDATE failed the enum cast. +log "4/8 Creating external workspace (no URL — exercises workspace.go:333)..." 
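+# Hand-run equivalent of the call below (fields listed are the ones this script
+# parses; the real response may carry more):
+#   curl -sS -X POST "$TENANT_URL/workspaces" \
+#     -H "Authorization: Bearer $TENANT_TOKEN" \
+#     -H "X-Molecule-Org-Id: $ORG_ID" \
+#     -H "Content-Type: application/json" \
+#     -d '{"name":"ext-e2e","runtime":"external","external":true}'
+# Expected post-046 fields: id, status == "awaiting_agent", and an auth token at
+# connection.auth_token (top-level auth_token on older response shapes).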
+WS_CREATE_RESP=$(tenant_call POST /workspaces \ + -d '{"name":"ext-e2e","runtime":"external","external":true}') + +WS_ID=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))") +WS_RESP_STATUS=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))") +WS_AUTH_TOKEN=$(echo "$WS_CREATE_RESP" | python3 -c " +import json,sys +try: + d = json.load(sys.stdin) + conn = d.get('connection') or {} + print(conn.get('auth_token','') or d.get('auth_token','')) +except Exception: + print('') +") +[ -z "$WS_ID" ] && fail "Workspace create missing id: $WS_CREATE_RESP" +[ "$WS_RESP_STATUS" != "awaiting_agent" ] && fail "Expected response status=awaiting_agent, got $WS_RESP_STATUS" +ok "Workspace created (id=$WS_ID, response status=awaiting_agent)" + +# This GET is the proof that the row actually has the value (not just +# the response body lying). Pre-migration-046 the UPDATE would have +# silently failed and this would return whatever 'provisioning' the +# initial INSERT left. Post-fix it must be 'awaiting_agent'. +log " Verifying DB row..." +GET_RESP=$(tenant_call GET "/workspaces/$WS_ID") +DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))") +[ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)" +ok "DB row stored as awaiting_agent (proof migration 046 applied)" + +# ─── 5. Register the workspace (transitions to online) ────────────────── +# Pre-fix this path was actually fine because it writes 'online', a value +# already in the enum. We exercise it anyway because the registration +# implicitly walks resolveDeliveryMode (registry.go:resolveDeliveryMode), +# which DOES read runtime + apply the new poll-default introduced by +# PR #2382. +log "5/8 Registering workspace via /registry/register..." +[ -z "$WS_AUTH_TOKEN" ] && fail "No workspace auth token returned — register impossible" +# Payload contract (workspace-server/internal/models/workspace.go RegisterPayload): +# id — required, the workspace UUID (NOT "workspace_id" — that's the +# heartbeat payload field; mixing them yields a 400 from +# ShouldBindJSON because `id` has binding:"required"). +# agent_card — required (binding:"required"); minimal valid card is name+skills. +# delivery_mode — set explicitly to "poll" so url validation is skipped +# regardless of whether the deployed image has the +# runtime=external→poll default from PR #2382. Observed +# 2026-04-30 17:18Z: a freshly-provisioned staging tenant +# was running an older workspace-server :latest image +# that lacked resolveDeliveryMode's external→poll branch, +# so the implicit default was push and validateAgentURL +# 400'd on example.invalid. Asserting on the implicit +# default makes the *register call* itself fragile to +# image-tag drift on the fleet — verify the default +# separately (step 5b assertion) without depending on it +# here. +# url — accepted but not dispatched-to in poll mode, so +# example.invalid is a valid sentinel. +REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID") +# Disable --fail-with-body for this one call so a 4xx surfaces the response +# body (the bare CURL_COMMON would `set -e`-kill before we could log it). 
+REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \ + -H "Authorization: Bearer $WS_AUTH_TOKEN" \ + -H "X-Molecule-Org-Id: $ORG_ID" \ + -H "Content-Type: application/json" \ + -d "$REGISTER_BODY") || true +log " register response: $(echo "$REGISTER_RESP" | head -c 300)" +echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above" + +GET_RESP=$(tenant_call GET "/workspaces/$WS_ID") +ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))") +[ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS" +ok "Workspace transitioned to online" + +# Confirm the register handler echoed back delivery_mode=poll. We read +# this from the register RESPONSE, not the workspace GET response, because +# the GET handler's SELECT (workspace.go:597) doesn't fetch delivery_mode +# — its column list pre-dates the delivery_mode column from #2339 PR 1. +# Surfacing delivery_mode in GET is tracked separately; not gating on it +# here keeps this test focused on the awaiting_agent transitions. +REGISTER_BODY_JSON=$(echo "$REGISTER_RESP" | head -n 1) +REGISTER_DELIVERY_MODE=$(echo "$REGISTER_BODY_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('delivery_mode',''))") +if [ "$REGISTER_DELIVERY_MODE" = "poll" ]; then + ok "delivery_mode=poll (register response echoed explicit value)" +else + fail "Register response delivery_mode=$REGISTER_DELIVERY_MODE (expected poll). Body: $REGISTER_BODY_JSON" +fi + +# ─── 6. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER ──────── +# This is the SECOND silent-failure path (registry/healthsweep.go's +# sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness +# UPDATE silently failed and the workspace stuck on 'online' forever +# even though no agent was alive. We wait the full window + a sweep +# interval and assert the row transitions back to 'awaiting_agent'. +log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..." +sleep "$STALE_WAIT_SECS" + +GET_RESP=$(tenant_call GET "/workspaces/$WS_ID") +STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))") +[ "$STALE_STATUS" != "awaiting_agent" ] && \ + fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running" +ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)" + +# ─── 7. Re-register and confirm we can come back online ───────────────── +# This proves the awaiting_agent state is recoverable (re-registrable), +# which is the whole point of using it instead of 'offline'. +log "7/8 Re-registering after stale → confirming recovery to online..." +# Same payload contract as step 5 (id + agent_card both required). See note +# there for why workspace_id would 400. 
+REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \ + -H "Authorization: Bearer $WS_AUTH_TOKEN" \ + -H "X-Molecule-Org-Id: $ORG_ID" \ + -H "Content-Type: application/json" \ + -d "$REGISTER_BODY") || true +log " re-register response: $(echo "$REREG_RESP" | head -c 300)" +echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above" + +GET_RESP=$(tenant_call GET "/workspaces/$WS_ID") +RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))") +[ "$RECOVERED_STATUS" != "online" ] && \ + fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS" +ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)" + +# ─── 8. Done — cleanup runs in the EXIT trap ─────────────────────────── +log "8/8 All four awaiting_agent transitions verified." +log "═══════════════════════════════════════════════════════════════════" +ok "External-runtime E2E PASSED on $SLUG" +log "═══════════════════════════════════════════════════════════════════" diff --git a/tests/harness/README.md b/tests/harness/README.md new file mode 100644 index 00000000..1306d8ae --- /dev/null +++ b/tests/harness/README.md @@ -0,0 +1,119 @@ +# Production-shape local harness + +The harness brings up the SaaS tenant topology on localhost using the +same `Dockerfile.tenant` image that ships to production. Tests run +against `http://harness-tenant.localhost:8080` and exercise the +SAME code path a real tenant takes — including TenantGuard middleware, +the `/cp/*` reverse proxy, the canvas reverse proxy, and a +Cloudflare-tunnel-shape header rewrite layer. + +## Why this exists + +Local `go run ./cmd/server` skips: +- `TenantGuard` middleware (no `MOLECULE_ORG_ID` env) +- `/cp/*` reverse proxy mount (no `CP_UPSTREAM_URL` env) +- `CANVAS_PROXY_URL` (canvas runs separately on `:3000`) +- Header rewrites that production's CF tunnel + LB perform +- Strict-auth mode (no live `ADMIN_TOKEN`) + +Bugs that survive `go run` and ship to production almost always live +in one of those layers. The harness activates ALL of them. + +## Topology + +``` +client + ↓ +cf-proxy nginx, mirrors CF tunnel header rewrites + ↓ (Host:harness-tenant.localhost, X-Forwarded-*) +tenant workspace-server/Dockerfile.tenant — same image as prod + ↓ (CP_UPSTREAM_URL=http://cp-stub:9090, /cp/* proxied) +cp-stub minimal Go service, mocks CP wire surface +postgres same version as production +redis same version as production +``` + +## Quickstart + +```bash +cd tests/harness +./up.sh # builds + starts all services +./seed.sh # mints admin token, registers two sample workspaces +./replays/peer-discovery-404.sh +./replays/buildinfo-stale-image.sh +./down.sh # tear down + remove volumes +``` + +To run every replay in one shot (boot, seed, run-all, teardown): + +```bash +cd tests/harness +./run-all-replays.sh # full lifecycle; non-zero exit if any replay fails +KEEP_UP=1 ./run-all-replays.sh # leave harness up for debugging +REBUILD=1 ./run-all-replays.sh # rebuild images before booting +``` + +First-time setup needs an `/etc/hosts` entry so `harness-tenant.localhost` +resolves to the local cf-proxy: + +```bash +echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts +``` + +(macOS resolves `*.localhost` automatically in some setups; Linux +typically does not.) 
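+Once `up.sh` and `seed.sh` have finished, a quick smoke of the production-shape
+layers looks like this (a sketch — both endpoints are ones the replays and
+compose healthchecks already rely on):
+
+```bash
+# Through cf-proxy, so the tenant sees the production Host / X-Forwarded-* shape
+curl -sS http://harness-tenant.localhost:8080/health
+
+# Reports the GIT_SHA the harness image was built with ("harness" unless you
+# exported GIT_SHA before ./up.sh)
+curl -sS http://harness-tenant.localhost:8080/buildinfo
+```
+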
+ +## Replay scripts + +Each replay script reproduces a real bug class against the harness so +fixes can be verified locally before deploy. The bar for adding a +replay is "this bug shipped to production despite local E2E being +green" — the script becomes the regression gate that closes that gap. + +| Replay | Closes | What it proves | +|--------|--------|----------------| +| `peer-discovery-404.sh` | #2397 | tool_list_peers surfaces the actual reason instead of "may be isolated" | +| `buildinfo-stale-image.sh` | #2395 | GIT_SHA reaches the binary; verify-step comparison logic works | + +To add a new replay: +1. Drop a script under `replays/` named after the issue. +2. The script's purpose: reproduce the production failure mode against + the harness, then assert the fix is present. PASS criterion is the + post-fix behavior. +3. The `run-all-replays.sh` runner picks up every `replays/*.sh` script + automatically — no per-replay registration needed. + +## Extending the cp-stub + +`cp-stub/main.go` serves the minimum surface for the existing replays +plus a catch-all that returns 501 + a clear message when the tenant +asks for a route the stub doesn't implement. To add a new CP route: + +1. Add a `mux.HandleFunc` in `cp-stub/main.go` for the path. +2. Return the same wire shape the real CP returns. The contract is + "wire compatibility with the staging CP at the time of writing" — + document it with a comment pointing at the real CP handler. +3. Add a replay script that exercises the path. + +## What the harness does NOT cover + +- Real TLS / cert handling (CF terminates TLS in production; harness is + HTTP-only). +- Cloudflare API edge cases (rate limits, DNS propagation timing). +- Real EC2 / SSM / EBS behavior (image-cache replay simulates the + outcome but not the AWS API surface). +- Cross-region or multi-AZ topology. +- Real production data scale. + +These are intentional Phase 1 limits. If a bug class hits one of these +gaps, escalate to staging E2E rather than expanding the harness past +its mandate of "exercise the tenant binary in production-shape topology." + +## Roadmap + +- **Phase 1 (shipped):** harness + cp-stub + cf-proxy + 2 replays + `run-all-replays.sh` runner. +- **Phase 2:** convert `tests/e2e/test_api.sh` to run against the + harness instead of localhost. Make harness-based E2E a required CI + check (a workflow that invokes `run-all-replays.sh` on every PR). +- **Phase 3:** config-coherence lint that diffs harness env list + against production CP's env list, fails CI on drift. diff --git a/tests/harness/cf-proxy/nginx.conf b/tests/harness/cf-proxy/nginx.conf new file mode 100644 index 00000000..a51efdba --- /dev/null +++ b/tests/harness/cf-proxy/nginx.conf @@ -0,0 +1,68 @@ +# cf-proxy — Cloudflare-tunnel-shape reverse proxy for the local harness. +# +# Production path: agent → CF tunnel → AWS LB → tenant container. +# This config replays the same header rewrites the CF tunnel does so +# the tenant sees the same Host + X-Forwarded-* it would in production. +# +# The tenant's TenantGuard middleware activates on MOLECULE_ORG_ID; the +# canvas's same-origin fetches use the Host header for cookie scoping. +# Both behave correctly in production because CF rewrites Host to the +# tenant subdomain — this proxy reproduces that locally. +# +# How tests reach it: +# curl --resolve 'harness-tenant.localhost:8443:127.0.0.1' \ +# https://harness-tenant.localhost:8443/health +# or via /etc/hosts (added automatically by ./up.sh on first boot). 
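+#
+# As wired below the proxy listens plain HTTP on 8080, so with the README's
+# /etc/hosts entry in place the simplest probe is:
+#   curl -sS http://harness-tenant.localhost:8080/health
+# (the 8443/TLS form only applies if you add the `listen 8443 ssl` setup
+# mentioned in the location block below).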
+ +worker_processes 1; +events { worker_connections 256; } + +http { + # Map the wildcard .localhost to the tenant container. The + # tenant container itself doesn't care which slug routed to it — + # what matters is that the Host header it sees matches what + # production's CF tunnel sets, so cookie/CORS/TenantGuard logic + # exercises the same code path. + server { + listen 8080; + server_name *.localhost localhost; + + # Cap upload at 50MB to mirror the staging tenant nginx limit; + # chat upload tests will fail closed if the platform handler + # ever silently expands its limit (catches the failure mode + # opposite of the chat-files lazy-heal incident). + client_max_body_size 50m; + + location / { + proxy_pass http://tenant:8080; + + # Header parity with CF tunnel + AWS LB. Production CF sets + # X-Forwarded-Proto=https; we keep http here because TLS + # termination in compose is unnecessary for testing the + # tenant logic — TLS is a CF concern, not a tenant bug + # surface. If TLS-specific bugs ever bite, add cert-manager + # + listen 8443 ssl here. + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Forwarded-Proto $scheme; + + # Streamable HTTP / SSE / WebSocket — the tenant exposes /ws + # and /events/stream + MCP /mcp/stream. Disabling buffering + # reproduces CF tunnel's pass-through streaming semantics + # (CF tunnel = no buffering by default; nginx default IS + # buffering, which would mask issue #2397-class streaming + # bugs by accumulating output until the client disconnects). + proxy_buffering off; + proxy_request_buffering off; + proxy_http_version 1.1; + proxy_set_header Connection ""; + + # Read timeout — CF tunnel default is 100s. Setting this to + # the same value catches "long agent run finishes after the + # proxy already closed the upstream" failure mode. + proxy_read_timeout 100s; + } + } +} diff --git a/tests/harness/compose.yml b/tests/harness/compose.yml new file mode 100644 index 00000000..1a382a6a --- /dev/null +++ b/tests/harness/compose.yml @@ -0,0 +1,140 @@ +# Production-shape harness for local E2E. +# +# Reproduces the SaaS tenant topology on localhost using the SAME +# images that ship to production: +# +# client → cf-proxy (nginx, mimics CF tunnel headers) +# → tenant (workspace-server/Dockerfile.tenant — combined platform + canvas) +# → cp-stub (control-plane stand-in) for /cp/* and CP-callback paths +# → postgres + redis (same versions as production) +# +# Why this matters: the workspace-server binary IS identical between +# local and production. The bugs that survive local E2E are topology +# bugs — env-gated middleware (TenantGuard, CP proxy, Canvas proxy), +# auth state, header rewrites, real production image. This harness +# activates ALL of them. +# +# Quickstart: +# cd tests/harness && ./up.sh +# ./seed.sh +# ./replays/peer-discovery-404.sh # reproduces issue #2397 +# +# Env config: +# GIT_SHA — passed to the tenant build for /buildinfo verification. +# Defaults to "harness" so /buildinfo distinguishes the +# harness build from any cached image. +# CP_STUB_PEERS_MODE — peers failure mode for replay scripts. +# "" / "404" / "401" / "500" / "timeout". 
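+#
+# Example (sketch): compose interpolates CP_STUB_PEERS_MODE from the
+# caller's environment, and ./up.sh passes its environment through to
+# `docker compose`, so a failure mode can be pinned for a whole harness
+# lifetime at boot:
+#
+#   CP_STUB_PEERS_MODE=404 ./up.sh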
+ +services: + postgres: + image: postgres:16-alpine + environment: + POSTGRES_USER: harness + POSTGRES_PASSWORD: harness + POSTGRES_DB: molecule + networks: [harness-net] + healthcheck: + test: ["CMD-SHELL", "pg_isready -U harness"] + interval: 2s + timeout: 5s + retries: 10 + + redis: + image: redis:7-alpine + networks: [harness-net] + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 2s + timeout: 5s + retries: 10 + + cp-stub: + build: + context: ./cp-stub + environment: + PORT: "9090" + CP_STUB_PEERS_MODE: "${CP_STUB_PEERS_MODE:-}" + networks: [harness-net] + healthcheck: + test: ["CMD-SHELL", "wget -q -O- http://localhost:9090/healthz || exit 1"] + interval: 2s + timeout: 5s + retries: 10 + + # The actual production tenant image — same Dockerfile.tenant CI publishes. + # This is the load-bearing part of the harness: every bug class that hides + # behind "but it works locally" is reproducible HERE, against this image, + # not against `go run ./cmd/server`. + tenant: + build: + context: ../.. + dockerfile: workspace-server/Dockerfile.tenant + args: + GIT_SHA: "${GIT_SHA:-harness}" + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + cp-stub: + condition: service_healthy + environment: + DATABASE_URL: "postgres://harness:harness@postgres:5432/molecule?sslmode=disable" + REDIS_URL: "redis://redis:6379" + PORT: "8080" + PLATFORM_URL: "http://tenant:8080" + MOLECULE_ENV: "production" + # SECRETS_ENCRYPTION_KEY is required when MOLECULE_ENV=production — + # crypto.InitStrict() refuses to boot without it. up.sh generates a + # fresh 32-byte key per harness lifetime via `openssl rand -base64 32` + # and exports it into this compose file's interpolation environment. + # The :? sentinel makes the misuse loud — running `docker compose up` + # directly without going through up.sh fails fast with a clear error + # rather than getting a confusing tenant-unhealthy timeout. + SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}" + # ADMIN_TOKEN flips the platform into strict-auth mode (matches + # production's CP-minted token configuration). Seeded value lets + # E2E scripts authenticate without going through CP. + ADMIN_TOKEN: "harness-admin-token" + # MOLECULE_ORG_ID — activates TenantGuard middleware. Every request + # must carry X-Molecule-Org-Id matching this value. Replays bugs + # that only fire in SaaS mode. + MOLECULE_ORG_ID: "harness-org" + # CP_UPSTREAM_URL — activates the /cp/* reverse proxy mount in + # router.go. Without this set, /cp/* would 404 and the canvas + # bootstrap would silently drift from production behavior. + CP_UPSTREAM_URL: "http://cp-stub:9090" + RATE_LIMIT: "1000" + # Canvas auto-proxy — entrypoint-tenant.sh exports CANVAS_PROXY_URL + # by default; keeping it explicit here makes the topology readable. + CANVAS_PROXY_URL: "http://localhost:3000" + networks: [harness-net] + healthcheck: + test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/health || exit 1"] + interval: 5s + timeout: 5s + retries: 20 + + # Cloudflare-tunnel-shape proxy — strips the :8080 suffix, rewrites + # Host to the tenant subdomain, injects X-Forwarded-*. Tests target + # http://harness-tenant.localhost:8080 and exercise the production + # routing layer. 
+ cf-proxy: + image: nginx:1.27-alpine + depends_on: + tenant: + condition: service_healthy + volumes: + - ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro + # Bind to 127.0.0.1 only — the harness uses a hardcoded ADMIN_TOKEN + # ("harness-admin-token") so binding 0.0.0.0 (compose's default) + # would expose admin access to anyone on the local network or VPN. + # Loopback-only is safe for E2E and prevents a known-token leak. + ports: + - "127.0.0.1:8080:8080" + networks: [harness-net] + +networks: + harness-net: + name: molecule-harness-net diff --git a/tests/harness/cp-stub/Dockerfile b/tests/harness/cp-stub/Dockerfile new file mode 100644 index 00000000..471029a6 --- /dev/null +++ b/tests/harness/cp-stub/Dockerfile @@ -0,0 +1,14 @@ +# cp-stub — minimal CP stand-in for the local production-shape harness. +# See main.go for the rationale. Self-contained build, no module deps. + +FROM golang:1.25-alpine AS builder +WORKDIR /src +COPY go.mod ./ +COPY main.go ./ +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /cp-stub . + +FROM alpine:3.20 +RUN apk add --no-cache ca-certificates +COPY --from=builder /cp-stub /cp-stub +EXPOSE 9090 +ENTRYPOINT ["/cp-stub"] diff --git a/tests/harness/cp-stub/go.mod b/tests/harness/cp-stub/go.mod new file mode 100644 index 00000000..0a2902c8 --- /dev/null +++ b/tests/harness/cp-stub/go.mod @@ -0,0 +1,3 @@ +module github.com/Molecule-AI/molecule-monorepo/tests/harness/cp-stub + +go 1.25 diff --git a/tests/harness/cp-stub/main.go b/tests/harness/cp-stub/main.go new file mode 100644 index 00000000..e87c3ece --- /dev/null +++ b/tests/harness/cp-stub/main.go @@ -0,0 +1,113 @@ +// cp-stub — minimal control-plane stand-in for the local production-shape harness. +// +// In production, the tenant Go server reverse-proxies /cp/* to the SaaS +// control-plane (molecule-controlplane). This stub plays that role on +// localhost so we can exercise the SAME code path the tenant takes in +// production — `if cpURL := os.Getenv("CP_UPSTREAM_URL"); cpURL != ""` +// in workspace-server/internal/router/router.go fires, the proxy mount +// activates, and tests exercise the real tenant→CP wire. +// +// This is NOT a CP reimplementation. It serves the minimum surface to: +// 1. Boot the tenant image without /cp/* breaking the canvas bootstrap. +// 2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx, +// returns malformed JSON) by toggling env vars. +// +// Scope is bounded by what the tenant + canvas actually call. Add new +// handlers as new replay scenarios demand them. Drift from real CP is +// tolerated because each handler is named for the exact path it serves — +// when the real CP changes, the failing scenario tells us where to look. +package main + +import ( + "encoding/json" + "fmt" + "log" + "net/http" + "os" + "sync/atomic" +) + +// redeployFleetCalls tracks how many times /cp/admin/tenants/redeploy-fleet +// was invoked. Replay scripts assert > 0 to confirm the workflow's redeploy +// step actually reached the stub (catches misrouted CP_URL configs). +var redeployFleetCalls atomic.Int64 + +func main() { + mux := http.NewServeMux() + + // /cp/auth/me — canvas calls this on bootstrap; minimal user record + // keeps the canvas from redirecting to login during local E2E. 
+ mux.HandleFunc("/cp/auth/me", func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, 200, map[string]any{ + "id": "harness-user", + "email": "harness@local", + "org_id": "harness-org", + "roles": []string{"admin"}, + }) + }) + + // /cp/admin/tenants/redeploy-fleet — exercised by the + // redeploy-tenants-on-{staging,main} workflow's local replay. Returns + // the same shape the real CP returns so the verify-fleet logic in CI + // can be tested without spinning up a real EC2 fleet. + mux.HandleFunc("/cp/admin/tenants/redeploy-fleet", func(w http.ResponseWriter, r *http.Request) { + redeployFleetCalls.Add(1) + writeJSON(w, 200, map[string]any{ + "ok": true, + "results": []map[string]any{ + { + "slug": "harness-tenant", + "phase": "redeploy", + "ssm_status": "Success", + "ssm_exit_code": 0, + "healthz_ok": true, + }, + }, + }) + }) + + // __stub/state — expose stub state (counters) so replay scripts can + // assert the tenant actually reached us. Read-only. + mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, 200, map[string]any{ + "redeploy_fleet_calls": redeployFleetCalls.Load(), + }) + }) + + // Catch-all for any /cp/* the tenant proxies. Keeps the harness from + // crashing the canvas when a new CP route is added — surfaces a clear + // "stub doesn't implement X" error instead of opaque 502 from the + // reverse proxy. + mux.HandleFunc("/cp/", func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, 501, map[string]any{ + "error": "cp-stub: handler not implemented for " + r.Method + " " + r.URL.Path, + "hint": "add a handler in tests/harness/cp-stub/main.go for the scenario you're testing", + }) + }) + + // /healthz — readiness probe for compose's depends_on. + mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { + writeJSON(w, 200, map[string]any{"status": "ok"}) + }) + + addr := ":" + envOr("PORT", "9090") + log.Printf("cp-stub listening on %s", addr) + if err := http.ListenAndServe(addr, mux); err != nil { + log.Fatal(err) + } +} + +func writeJSON(w http.ResponseWriter, code int, body any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(code) + if err := json.NewEncoder(w).Encode(body); err != nil { + fmt.Fprintf(os.Stderr, "cp-stub: write json: %v\n", err) + } +} + +func envOr(k, def string) string { + if v := os.Getenv(k); v != "" { + return v + } + return def +} diff --git a/tests/harness/down.sh b/tests/harness/down.sh new file mode 100755 index 00000000..683c4dae --- /dev/null +++ b/tests/harness/down.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$HERE" +docker compose -f compose.yml down -v --remove-orphans +echo "[harness] down + volumes removed." diff --git a/tests/harness/replays/buildinfo-stale-image.sh b/tests/harness/replays/buildinfo-stale-image.sh new file mode 100755 index 00000000..9d9be053 --- /dev/null +++ b/tests/harness/replays/buildinfo-stale-image.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Replay for issue #2395 — local proof that the /buildinfo verify gate +# closes the SaaS deploy-chain blindness. +# +# Prior behavior: redeploy-fleet returned ssm_status=Success based on +# the SSM RPC return code alone. EC2 tenants kept serving the cached +# :latest digest because `docker compose up -d` is a no-op when the +# tag hasn't been invalidated. ssm_status=Success was lying. +# +# This replay simulates that condition locally: +# 1. Boot the harness with GIT_SHA=fix-applied. +# 2. 
Curl /buildinfo and assert it returns "fix-applied" (the new code +# actually shipped). +# 3. Negative test: curl with a different EXPECTED_SHA and assert the +# mismatch detection logic the workflow uses returns failure. +# +# This proves the verify-step's jq lookup + comparison logic works +# against the SAME Dockerfile.tenant production builds. If the +# /buildinfo route ever stops being wired through, this replay +# catches it before it reaches a production tenant. + +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HARNESS_ROOT="$(dirname "$HERE")" + +BASE="${BASE:-http://harness-tenant.localhost:8080}" + +# 1. Confirm /buildinfo wire shape — same shape the workflow's jq lookup expects. +echo "[replay] curl $BASE/buildinfo ..." +BUILD_JSON=$(curl -sS "$BASE/buildinfo") +echo "[replay] $BUILD_JSON" + +ACTUAL_SHA=$(echo "$BUILD_JSON" | jq -r '.git_sha // ""') +if [ -z "$ACTUAL_SHA" ]; then + echo "[replay] FAIL: /buildinfo response missing git_sha field — workflow's jq lookup would null" + exit 1 +fi +echo "[replay] git_sha=$ACTUAL_SHA" + +# 2. Assert the harness build threaded GIT_SHA through. If we got "dev", +# the Dockerfile arg / ldflags wiring is broken — same regression +# class that made #2395 invisible until production. +EXPECTED_FROM_HARNESS="${HARNESS_GIT_SHA:-harness}" +if [ "$ACTUAL_SHA" = "dev" ]; then + echo "[replay] FAIL: /buildinfo returned 'dev' — Dockerfile.tenant ARG GIT_SHA isn't reaching the binary" + echo "[replay] This regresses #2395 by silencing the deploy-verify gate." + exit 1 +fi +if [ "$ACTUAL_SHA" != "$EXPECTED_FROM_HARNESS" ]; then + echo "[replay] WARN: /buildinfo returned '$ACTUAL_SHA' but harness was built with GIT_SHA='$EXPECTED_FROM_HARNESS'" + echo "[replay] Image may be cached from a previous run. Run ./up.sh --rebuild to force a fresh build." +fi + +# 3. Negative test — replay the workflow's mismatch detection by +# comparing the actual SHA to a deliberately-wrong expected SHA. +WRONG_EXPECTED="0000000000000000000000000000000000000000" +if [ "$ACTUAL_SHA" = "$WRONG_EXPECTED" ]; then + echo "[replay] FAIL: /buildinfo returned all-zero SHA — wiring inverted" + exit 1 +fi + +# 4. Replay the workflow's exact comparison logic so a regression in +# the verify step's bash gets caught here. +MISMATCH_DETECTED=0 +if [ "$ACTUAL_SHA" != "$WRONG_EXPECTED" ]; then + MISMATCH_DETECTED=1 +fi +if [ "$MISMATCH_DETECTED" != "1" ]; then + echo "[replay] FAIL: workflow comparison logic would not flag a real mismatch" + exit 1 +fi + +echo "" +echo "[replay] PASS: /buildinfo wire shape, GIT_SHA injection, and mismatch detection all work in" +echo " production-shape topology. The redeploy-fleet verify-step covers what it claims to." diff --git a/tests/harness/replays/peer-discovery-404.sh b/tests/harness/replays/peer-discovery-404.sh new file mode 100755 index 00000000..cfd393b7 --- /dev/null +++ b/tests/harness/replays/peer-discovery-404.sh @@ -0,0 +1,139 @@ +#!/usr/bin/env bash +# Replay for issue #2397 — local proof that peer-discovery surfaces +# actionable diagnostics instead of "may be isolated". +# +# Prior behavior: tool_list_peers returned "No peers available (this +# workspace may be isolated)" regardless of WHY peers were empty — +# five distinct conditions (200+empty, 401, 403, 404, 5xx, network) +# collapsed to one ambiguous message. +# +# This replay proves two things, separately: +# (a) WIRE: the platform side of the contract — the tenant's +# /registry//peers returns 404. If this regresses +# (e.g. 
tenant starts returning 200 with empty list, or 500), +# the runtime helper would parse it differently and the agent +# would see a different diagnostic. The harness catches that here. +# (b) PARSE: the runtime helper, given a 404, produces a diagnostic +# containing "404" + "register" hints. Done in unit tests against +# a mock httpx response (test_a2a_client.py::TestGetPeersWithDiagnostic +# — the harness re-asserts the same contract here against a real +# Python eval that does NOT depend on workspace auth tokens. +# +# Why split the assertion: the Python eval here doesn't have the +# workspace's auth token file, so going through get_peers_with_diagnostic +# directly would hit the platform without auth and produce a different +# branch (401 instead of 404). Splitting (a) from (b) keeps each +# assertion targeting exactly what it claims to test. + +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HARNESS_ROOT="$(dirname "$HERE")" +cd "$HARNESS_ROOT" + +if [ ! -f .seed.env ]; then + echo "[replay] no .seed.env — running ./seed.sh first..." + ./seed.sh +fi +# shellcheck source=/dev/null +source .seed.env + +BASE="${BASE:-http://harness-tenant.localhost:8080}" +ADMIN="harness-admin-token" +ORG="harness-org" + +# ─── (a) WIRE: tenant returns 404 for an unregistered workspace ──────── +ROGUE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')" +echo "[replay] (a) WIRE: querying /registry/$ROGUE_ID/peers (unregistered workspace)..." +HTTP_CODE=$(curl -sS -o /tmp/peer-replay.json -w '%{http_code}' \ + -H "Authorization: Bearer $ADMIN" \ + -H "X-Molecule-Org-Id: $ORG" \ + -H "X-Workspace-ID: $ROGUE_ID" \ + "$BASE/registry/$ROGUE_ID/peers") + +echo "[replay] tenant responded HTTP $HTTP_CODE" +if [ "$HTTP_CODE" != "404" ]; then + echo "[replay] FAIL (a): expected 404 from /registry//peers, got $HTTP_CODE" + echo "[replay] This is a platform-side regression — the runtime's diagnostic helper" + echo "[replay] would see a different status code than the unit tests cover." + cat /tmp/peer-replay.json + exit 1 +fi + +# ─── (b) PARSE: helper converts a synthetic 404 to actionable diagnostic ─ +# +# We construct a synthetic httpx 404 response and run the helper against +# it directly. This isolates the parse branch we want to test from the +# auth-context concerns of going through the network. The helper's network +# branches are exhaustively covered by tests/test_a2a_client.py — this is +# a regression-guard that the helper IS in the install, IS importable in +# the harness's Python env, and IS reading the status code. + +WORKSPACE_PATH="$(cd "$HARNESS_ROOT/../../workspace" && pwd)" +DIAGNOSTIC=$(WORKSPACE_ID="harness-rogue" PYTHONPATH="$WORKSPACE_PATH" \ + python3 - "$WORKSPACE_PATH" <<'PYEOF' +import asyncio +import sys +import types +from unittest.mock import AsyncMock, MagicMock, patch + +# Stub platform_auth so a2a_client imports cleanly without requiring a +# real workspace token file. The helper's auth_headers() only matters +# when going through the network; we're feeding it a mock response. +_pa = types.ModuleType("platform_auth") +_pa.auth_headers = lambda: {} +_pa.self_source_headers = lambda: {} +sys.modules.setdefault("platform_auth", _pa) + +sys.path.insert(0, sys.argv[1]) +import a2a_client # noqa: E402 + +# This replay validates PR #2399's diagnostic helper. If the workspace +# runtime in the current checkout pre-dates that fix, fail with a +# clear message instead of an opaque AttributeError. 
+if not hasattr(a2a_client, "get_peers_with_diagnostic"): + print("__SKIP__: workspace/a2a_client.py is pre-#2399 (no get_peers_with_diagnostic).") + sys.exit(0) + +resp = MagicMock() +resp.status_code = 404 +resp.json = MagicMock(return_value={"detail": "not found"}) + +mock_client = AsyncMock() +mock_client.__aenter__ = AsyncMock(return_value=mock_client) +mock_client.__aexit__ = AsyncMock(return_value=False) +mock_client.get = AsyncMock(return_value=resp) + +async def main(): + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + peers, diag = await a2a_client.get_peers_with_diagnostic() + print(repr(diag)) + +asyncio.run(main()) +PYEOF +) + +if [[ "$DIAGNOSTIC" == __SKIP__:* ]]; then + echo "[replay] (b) SKIP: ${DIAGNOSTIC#__SKIP__: }" + echo "[replay] Re-run after #2399 lands on staging." + echo "" + echo "[replay] PASS (a) only: peer-discovery wire returns 404 (parse branch skipped — see above)." + exit 0 +fi + +echo "[replay] (b) PARSE: helper diagnostic = $DIAGNOSTIC" + +if ! echo "$DIAGNOSTIC" | grep -q "404"; then + echo "[replay] FAIL (b): diagnostic missing '404' — helper regressed to swallow-the-status-code" + exit 1 +fi +if ! echo "$DIAGNOSTIC" | grep -qi "regist"; then + echo "[replay] FAIL (b): diagnostic missing 'register' guidance — helper regressed to opaque message" + exit 1 +fi +if echo "$DIAGNOSTIC" | grep -qi "may be isolated"; then + echo "[replay] FAIL (b): diagnostic still says 'may be isolated' — fix didn't reach this code path" + exit 1 +fi + +echo "" +echo "[replay] PASS: peer-discovery (a) wire returns 404, (b) helper produces actionable diagnostic." diff --git a/tests/harness/requirements.txt b/tests/harness/requirements.txt new file mode 100644 index 00000000..75a30722 --- /dev/null +++ b/tests/harness/requirements.txt @@ -0,0 +1,14 @@ +# Harness-replay Python deps — minimal set for replays/*.sh scripts that +# eval Python against the running tenant (e.g. importing +# workspace/a2a_client.py to assert parser behavior). +# +# This is intentionally smaller than workspace/requirements.txt: the +# replays don't need a2a-sdk, langchain, opentelemetry, etc. — only the +# HTTP client surface that the imported helpers depend on. Adding the +# full workspace deps would slow every harness CI run by ~30s for no +# gain. +# +# Add a line here (with a version constraint matching workspace/requirements.txt) +# when a new replay introduces a new Python import. + +httpx>=0.28.1 diff --git a/tests/harness/run-all-replays.sh b/tests/harness/run-all-replays.sh new file mode 100755 index 00000000..092158c3 --- /dev/null +++ b/tests/harness/run-all-replays.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# Run every replay under tests/harness/replays/ against a fresh harness. +# +# Boots the harness (up.sh + seed.sh), runs each `replays/*.sh` in +# alphabetical order, tracks pass/fail, and tears down on exit. Returns +# non-zero if any replay failed. +# +# Usage: +# ./run-all-replays.sh # boot, run, teardown +# KEEP_UP=1 ./run-all-replays.sh # leave harness running on exit (debug) +# REBUILD=1 ./run-all-replays.sh # rebuild images before booting +# +# CI usage: invoke without flags. The trap-on-EXIT teardown ensures we +# don't leak Docker resources when a replay fails partway through. + +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$HERE" + +REPLAYS_DIR="$HERE/replays" +if [ ! 
-d "$REPLAYS_DIR" ]; then + echo "[run-all] no replays/ directory at $REPLAYS_DIR — nothing to run" + exit 1 +fi + +shopt -s nullglob +REPLAYS=("$REPLAYS_DIR"/*.sh) +shopt -u nullglob +if [ ${#REPLAYS[@]} -eq 0 ]; then + echo "[run-all] replays/ is empty — nothing to run" + exit 1 +fi + +cleanup() { + local exit_code=$? + if [ "${KEEP_UP:-0}" = "1" ]; then + echo "" + echo "[run-all] KEEP_UP=1 — leaving harness up. Tear down manually with ./down.sh" + else + echo "" + echo "[run-all] tearing down harness..." + ./down.sh >/dev/null 2>&1 || echo "[run-all] WARN: ./down.sh exited non-zero" + fi + exit "$exit_code" +} +trap cleanup EXIT INT TERM + +echo "[run-all] booting harness..." +if [ "${REBUILD:-0}" = "1" ]; then + ./up.sh --rebuild +else + ./up.sh +fi + +echo "[run-all] seeding workspaces..." +./seed.sh + +PASS_COUNT=0 +FAIL_COUNT=0 +SKIP_COUNT=0 +FAILED_NAMES=() + +for replay in "${REPLAYS[@]}"; do + name=$(basename "$replay" .sh) + echo "" + echo "[run-all] ━━━ $name ━━━" + if bash "$replay"; then + # Replays signal "skip" by exiting 0 with a __SKIP__ marker in stdout — + # but we capture that as a pass here since the script exited 0. The + # skip is documented in the script's own output. CI uses pass/fail. + PASS_COUNT=$((PASS_COUNT + 1)) + echo "[run-all] PASS: $name" + else + FAIL_COUNT=$((FAIL_COUNT + 1)) + FAILED_NAMES+=("$name") + echo "[run-all] FAIL: $name" + fi +done + +echo "" +echo "[run-all] =============================" +echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed (of ${#REPLAYS[@]} total)" +if [ ${FAIL_COUNT} -gt 0 ]; then + echo "[run-all] Failed:" + for name in "${FAILED_NAMES[@]}"; do + echo "[run-all] - $name" + done + exit 1 +fi +echo "[run-all] All replays passed." diff --git a/tests/harness/seed.sh b/tests/harness/seed.sh new file mode 100755 index 00000000..bb1bfc21 --- /dev/null +++ b/tests/harness/seed.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Seed the harness with two registered workspaces so peer-discovery +# replay scripts have something to discover. +# +# - "alpha" parent (tier 0) +# - "beta" child of alpha (tier 1) +# +# Both register via the platform's /registry/register endpoint, which +# is what real workspaces do at boot. The platform then has them in its +# DB; tool_list_peers from inside alpha can resolve beta as a peer. + +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$HERE" + +BASE="${BASE:-http://harness-tenant.localhost:8080}" +ADMIN="harness-admin-token" +ORG="harness-org" + +curl_admin() { + curl -sS -H "Authorization: Bearer $ADMIN" \ + -H "X-Molecule-Org-Id: $ORG" \ + -H "Content-Type: application/json" "$@" +} + +echo "[seed] confirming tenant is reachable via cf-proxy..." +HEALTH=$(curl -sS "$BASE/health" || echo "") +if [ -z "$HEALTH" ]; then + echo "[seed] FAILED: $BASE/health unreachable. Did ./up.sh complete? Did you add" + echo " 127.0.0.1 harness-tenant.localhost to /etc/hosts?" + exit 1 +fi +echo "[seed] $HEALTH" + +echo "[seed] confirming /buildinfo returns the harness GIT_SHA..." +BUILD=$(curl -sS "$BASE/buildinfo" || echo "") +echo "[seed] $BUILD" + +# Mint a fresh admin-call workspace ID for the parent. Platform's +# /admin/workspaces/:id/test-token mints a per-workspace bearer; the +# replay scripts use it to call the workspace-scoped routes. +echo "[seed] creating workspace 'alpha' (parent)..." 
+ALPHA_ID=$(uuidgen | tr '[:upper:]' '[:lower:]') +curl_admin -X POST "$BASE/workspaces" \ + -d "{\"id\":\"$ALPHA_ID\",\"name\":\"alpha\",\"tier\":0,\"runtime\":\"langgraph\"}" \ + >/dev/null +echo "[seed] alpha id=$ALPHA_ID" + +echo "[seed] creating workspace 'beta' (child of alpha)..." +BETA_ID=$(uuidgen | tr '[:upper:]' '[:lower:]') +curl_admin -X POST "$BASE/workspaces" \ + -d "{\"id\":\"$BETA_ID\",\"name\":\"beta\",\"tier\":1,\"parent_id\":\"$ALPHA_ID\",\"runtime\":\"langgraph\"}" \ + >/dev/null +echo "[seed] beta id=$BETA_ID" + +# Stash IDs so replay scripts pick them up. +{ + echo "ALPHA_ID=$ALPHA_ID" + echo "BETA_ID=$BETA_ID" +} > "$HERE/.seed.env" + +echo "" +echo "[seed] done. IDs persisted to tests/harness/.seed.env" +echo "[seed] ALPHA_ID=$ALPHA_ID" +echo "[seed] BETA_ID=$BETA_ID" diff --git a/tests/harness/up.sh b/tests/harness/up.sh new file mode 100755 index 00000000..fbc14910 --- /dev/null +++ b/tests/harness/up.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Bring the production-shape harness up. +# +# Usage: ./up.sh [--rebuild] +# +# Always operates in tests/harness/ regardless of where it's invoked +# from — test scripts under tests/harness/replays/ source it via the +# absolute path, so cd-ing first prevents compose-context surprises. + +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$HERE" + +REBUILD=false +for arg in "$@"; do + case "$arg" in + --rebuild) REBUILD=true ;; + esac +done + +# Generate a per-run encryption key. The tenant runs with +# MOLECULE_ENV=production (intentional, to replay prod-shape bugs), and +# crypto.InitStrict() refuses to boot without SECRETS_ENCRYPTION_KEY. +# Generate fresh so: +# - No key-shaped string lives in the repo (avoids muscle-memorying a +# hardcoded value into other places + secret-scanner false positives). +# - Each harness lifetime gets a unique key, mimicking prod's per-tenant +# isolation. Persistence across runs isn't required — the harness DB +# is wiped on every ./down.sh. +# Honor a caller-supplied value if already exported (lets a debug session +# pin a key for reproducibility). +if [ -z "${SECRETS_ENCRYPTION_KEY:-}" ]; then + SECRETS_ENCRYPTION_KEY=$(openssl rand -base64 32) + export SECRETS_ENCRYPTION_KEY +fi + +if [ "$REBUILD" = true ]; then + docker compose -f compose.yml build --no-cache tenant cp-stub +fi + +echo "[harness] starting cp-stub + postgres + redis + tenant + cf-proxy ..." +docker compose -f compose.yml up -d --wait + +echo "[harness] /etc/hosts entry for harness-tenant.localhost..." +if ! grep -q '^127\.0\.0\.1[[:space:]]\+harness-tenant\.localhost' /etc/hosts; then + echo " (skip — your /etc/hosts may not resolve *.localhost. If tests fail with" + echo " 'getaddrinfo' errors, add: 127.0.0.1 harness-tenant.localhost)" +fi + +echo "" +echo "[harness] up. Tenant: http://harness-tenant.localhost:8080/health" +echo " http://harness-tenant.localhost:8080/buildinfo" +echo " cp-stub: http://localhost (internal-only via compose net)" +echo "" +echo "Next: ./seed.sh # mint admin token + register sample workspaces" diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index d0d5ae57..f620537b 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -223,13 +223,24 @@ func main() { registry.StartLivenessMonitor(c, onWorkspaceOffline) }) - // Proactive container health sweep — detects dead containers faster than Redis TTL. - // Checks all "online" workspaces against Docker every 15 seconds. 
- if prov != nil { - go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) { - registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline) - }) - } + // Proactive health sweep — two passes per tick: + // 1. Docker-side: checks "online" workspaces against the local Docker + // daemon (only runs when prov is non-nil, i.e. self-hosted mode). + // 2. Remote-side: scans runtime='external' rows whose last_heartbeat_at + // is past REMOTE_LIVENESS_STALE_AFTER and flips them to + // awaiting_agent. Runs regardless of provisioner mode — SaaS + // tenants need this even though they don't run Docker locally, + // because external-runtime workspaces are operator-managed and + // the platform-side liveness sweep is the only thing that + // transitions them off 'online' when the operator's CLI dies. + // + // Pre-2026-04-30 this goroutine was gated on prov != nil, which silently + // disabled the remote-side sweep on every SaaS tenant. The function in + // healthsweep.go has always handled nil checker correctly; only the + // orchestration was wrong. See #2392's CI failure for the trace. + go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) { + registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline) + }) // Orphan-container reconcile sweep — finds running containers // whose workspace row is already status='removed' and stops diff --git a/workspace-server/internal/middleware/tenant_guard.go b/workspace-server/internal/middleware/tenant_guard.go index 4aaf7a4c..1363060d 100644 --- a/workspace-server/internal/middleware/tenant_guard.go +++ b/workspace-server/internal/middleware/tenant_guard.go @@ -53,6 +53,7 @@ const tenantOrgIDHeader = "X-Molecule-Org-Id" // here only bypasses the cross-org routing check, not auth. var tenantGuardAllowlist = map[string]struct{}{ "/health": {}, + "/buildinfo": {}, "/metrics": {}, "/registry/register": {}, "/registry/heartbeat": {}, diff --git a/workspace-server/internal/middleware/tenant_guard_test.go b/workspace-server/internal/middleware/tenant_guard_test.go index 5d2b4731..c2ab5792 100644 --- a/workspace-server/internal/middleware/tenant_guard_test.go +++ b/workspace-server/internal/middleware/tenant_guard_test.go @@ -8,13 +8,15 @@ import ( "github.com/gin-gonic/gin" ) -// helper: build a router with TenantGuard configured to `orgID` and two -// representative routes — a regular API route and two allowlisted ones. +// helper: build a router with TenantGuard configured to `orgID` and a +// representative API route plus the public allowlisted ones (/health, +// /buildinfo, /metrics). func newGuardedRouter(orgID string) *gin.Engine { gin.SetMode(gin.TestMode) r := gin.New() r.Use(TenantGuardWithOrgID(orgID)) r.GET("/health", func(c *gin.Context) { c.String(200, "ok") }) + r.GET("/buildinfo", func(c *gin.Context) { c.String(200, "buildinfo") }) r.GET("/metrics", func(c *gin.Context) { c.String(200, "metrics") }) r.GET("/workspaces", func(c *gin.Context) { c.String(200, "workspaces") }) return r @@ -71,10 +73,14 @@ func TestTenantGuard_MissingHeaderIs404(t *testing.T) { } // Allowlisted paths bypass the guard even in tenant mode — required for health -// probes (Fly Machines checks) and Prometheus scrape. +// probes (Fly Machines checks), Prometheus scrape, and the redeploy-fleet +// /buildinfo verification step. /buildinfo without an org header used to +// 404-via-NoRoute → canvas (HTML), which made the redeploy verifier think +// every tenant was stale even when the binary was current. 
Pin this so a +// future allowlist edit can't silently regress that check. func TestTenantGuard_AllowlistBypassesCheck(t *testing.T) { r := newGuardedRouter("org-abc") - for _, path := range []string{"/health", "/metrics"} { + for _, path := range []string{"/health", "/buildinfo", "/metrics"} { w := doRequest(r, path, "") // no header if w.Code != 200 { t.Errorf("%s: allowlisted path should return 200 without header, got %d", path, w.Code) diff --git a/workspace/a2a_client.py b/workspace/a2a_client.py index 7c5e3d87..43882bd1 100644 --- a/workspace/a2a_client.py +++ b/workspace/a2a_client.py @@ -229,19 +229,61 @@ async def send_a2a_message(target_url: str, message: str) -> str: return _format_a2a_error(last_exc, target_url) -async def get_peers() -> list[dict]: - """Get this workspace's peers from the platform registry.""" +async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]: + """Get this workspace's peers, returning (peers, diagnostic). + + diagnostic is None when the call succeeded (status 200, even if the list + is empty). When peers is [] for a non-trivial reason (auth failure, + workspace-id missing from registry, platform error, network error), + diagnostic is a short human-readable string explaining what went wrong + so callers can surface it instead of "may be isolated" — see #2397. + + The legacy get_peers() shim below preserves the bare-list contract for + non-tool callers. + """ + url = f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers" async with httpx.AsyncClient(timeout=10.0) as client: try: resp = await client.get( - f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers", + url, headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()}, ) - if resp.status_code == 200: - return resp.json() - return [] - except Exception: - return [] + except Exception as e: + return [], f"Cannot reach platform at {PLATFORM_URL}: {e}" + + if resp.status_code == 200: + try: + data = resp.json() + except Exception as e: + return [], f"Platform returned 200 but body was not JSON: {e}" + if not isinstance(data, list): + return [], f"Platform returned 200 but body was not a list: {type(data).__name__}" + return data, None + + if resp.status_code in (401, 403): + return [], ( + f"Authentication to platform failed (HTTP {resp.status_code}). " + "The workspace bearer token may be invalid — restarting the workspace usually re-mints it." + ) + if resp.status_code == 404: + return [], ( + f"Workspace ID {WORKSPACE_ID} is not registered with the platform (HTTP 404). " + "Re-registration via the platform's /registry/register endpoint is needed." + ) + if 500 <= resp.status_code < 600: + return [], f"Platform error: HTTP {resp.status_code}." + return [], f"Unexpected platform response: HTTP {resp.status_code}." + + +async def get_peers() -> list[dict]: + """Get this workspace's peers from the platform registry. + + Bare-list shim over get_peers_with_diagnostic() — discards the diagnostic + so callers that don't care about the failure reason (e.g. system-prompt + bootstrap formatters) get the same shape they always had. 
+ """ + peers, _ = await get_peers_with_diagnostic() + return peers async def get_workspace_info() -> dict: diff --git a/workspace/a2a_tools.py b/workspace/a2a_tools.py index 4939e254..d5be00bd 100644 --- a/workspace/a2a_tools.py +++ b/workspace/a2a_tools.py @@ -18,6 +18,7 @@ from a2a_client import ( _peer_names, discover_peer, get_peers, + get_peers_with_diagnostic, get_workspace_info, send_a2a_message, ) @@ -410,9 +411,16 @@ async def tool_send_message_to_user(message: str, attachments: list[str] | None async def tool_list_peers() -> str: """List all workspaces this agent can communicate with.""" - peers = await get_peers() + peers, diagnostic = await get_peers_with_diagnostic() if not peers: - return "No peers available (this workspace may be isolated)" + if diagnostic is not None: + # Non-trivial empty: auth failure / 404 / 5xx / network — surface + # the actual reason so the user/agent doesn't have to guess. #2397. + return f"No peers found. {diagnostic}" + return ( + "You have no peers in the platform registry. " + "(No parent, no children, no siblings registered.)" + ) lines = [] for p in peers: status = p.get("status", "unknown") diff --git a/workspace/tests/test_a2a_client.py b/workspace/tests/test_a2a_client.py index e105fb1e..1412c91f 100644 --- a/workspace/tests/test_a2a_client.py +++ b/workspace/tests/test_a2a_client.py @@ -577,6 +577,149 @@ class TestGetPeers: assert headers_sent.get("X-Workspace-ID") == a2a_client.WORKSPACE_ID +# --------------------------------------------------------------------------- +# get_peers_with_diagnostic — issue #2397 +# +# Pin: an empty peer list MUST come with an actionable diagnostic on every +# non-200 + every transport failure. The bug was that get_peers swallowed +# every failure mode behind `return []`, leaving the agent's tool wrapper +# with no way to distinguish "you have no peers" from "auth broke" / "404 +# from registry" / "platform 5xx" / "network timeout". Each of these +# requires a different operator action. +# --------------------------------------------------------------------------- + +class TestGetPeersWithDiagnostic: + + async def test_200_returns_peers_and_no_diagnostic(self): + """200 with valid list → (peers, None). diagnostic stays None on success.""" + import a2a_client + + peers = [{"id": "ws-1", "name": "Alpha"}] + resp = _make_response(200, peers) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == peers + assert diag is None + + async def test_200_empty_list_returns_no_diagnostic(self): + """200 with [] → (peers=[], diag=None). 
Truly no peers is success, not error.""" + import a2a_client + + resp = _make_response(200, []) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is None + + async def test_401_returns_auth_diagnostic(self): + """401 → diagnostic mentions auth + restart hint.""" + import a2a_client + + resp = _make_response(401, {"detail": "unauthorized"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "401" in diag + assert "Authentication" in diag or "authentication" in diag.lower() + + async def test_403_returns_auth_diagnostic(self): + """403 → same auth-failure diagnostic shape as 401.""" + import a2a_client + + resp = _make_response(403, {"detail": "forbidden"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "403" in diag + + async def test_404_returns_registration_diagnostic(self): + """404 → diagnostic tells operator the workspace ID is missing from the registry.""" + import a2a_client + + resp = _make_response(404, {"detail": "not found"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "404" in diag + assert "registered" in diag.lower() or "registration" in diag.lower() + + async def test_500_returns_platform_error_diagnostic(self): + """5xx → 'Platform error: HTTP .'""" + import a2a_client + + resp = _make_response(503, {"detail": "service unavailable"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "503" in diag + assert "Platform error" in diag or "platform error" in diag.lower() + + async def test_network_exception_returns_unreachable_diagnostic(self): + """httpx exception → diagnostic mentions PLATFORM_URL + the underlying error.""" + import a2a_client + + mock_client = _make_mock_client(get_exc=TimeoutError("connection timed out")) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "Cannot reach platform" in diag or "cannot reach" in diag.lower() + assert "timed out" in diag + + async def test_200_with_non_list_body_returns_diagnostic(self): + """200 but body is a dict → diagnostic flags shape mismatch (regression guard).""" + import a2a_client + + resp = _make_response(200, {"oops": "should have been a list"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "list" in diag.lower() + + async def test_get_peers_shim_preserves_bare_list_contract(self): + """get_peers() still returns just list[dict] — no API break for 
non-tool callers.""" + import a2a_client + + peers = [{"id": "ws-1", "name": "Alpha"}] + resp = _make_response(200, peers) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result = await a2a_client.get_peers() + + # Must be a list, not a tuple — bare-list shim contract. + assert isinstance(result, list) + assert result == peers + + # --------------------------------------------------------------------------- # get_workspace_info # --------------------------------------------------------------------------- diff --git a/workspace/tests/test_a2a_tools_impl.py b/workspace/tests/test_a2a_tools_impl.py index 22a49268..90d31560 100644 --- a/workspace/tests/test_a2a_tools_impl.py +++ b/workspace/tests/test_a2a_tools_impl.py @@ -536,11 +536,54 @@ class TestToolSendMessageToUser: class TestToolListPeers: - async def test_no_peers_returns_isolated_message(self): + async def test_true_empty_returns_no_peers_message_without_diagnostic(self): + """200 + empty list → 'no peers in the platform registry' (no failure).""" import a2a_tools - with patch("a2a_tools.get_peers", return_value=[]): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], None)): result = await a2a_tools.tool_list_peers() - assert "No peers available" in result + # The new wording explicitly says no peers exist (no parent/sibling/child). + # Avoids the misleading "may be isolated" hint when discovery succeeded. + assert "no peers" in result.lower() + assert "No peers found." not in result # diagnostic prefix should NOT appear on the success branch + assert "may be isolated" not in result + + async def test_auth_failure_surfaces_restart_hint(self): + """401/403 → tool_list_peers must surface the auth failure + restart hint, not 'isolated'.""" + import a2a_tools + diag = "Authentication to platform failed (HTTP 401). Restart the workspace to re-mint." + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "401" in result + assert "Authentication" in result + # The "isolated" message was the bug — make sure the regression doesn't return. + assert "may be isolated" not in result + + async def test_404_surfaces_registration_hint(self): + """404 → tool_list_peers tells the user re-registration is needed.""" + import a2a_tools + diag = "Workspace ID ws-test is not registered with the platform (HTTP 404). Re-register." + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "404" in result + assert "registered" in result.lower() + + async def test_5xx_surfaces_platform_error(self): + """5xx → 'Platform error' surfaced; agent / user can correctly route to oncall.""" + import a2a_tools + diag = "Platform error: HTTP 503." 
+ with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "503" in result + assert "Platform error" in result + + async def test_network_error_surfaces_unreachable(self): + """Network error → operator can tell that the workspace can't reach the platform at all.""" + import a2a_tools + diag = "Cannot reach platform at http://platform.example: timed out" + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "Cannot reach platform" in result + assert "timed out" in result async def test_peers_returned_formatted_lines(self): """Peers list is formatted as '- name (ID: ..., status: ..., role: ...)'.""" @@ -550,7 +593,7 @@ class TestToolListPeers: {"id": "ws-1", "name": "Alpha", "status": "online", "role": "worker"}, {"id": "ws-2", "name": "Beta", "status": "idle", "role": "analyst"}, ] - with patch("a2a_tools.get_peers", return_value=peers): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)): result = await a2a_tools.tool_list_peers() assert "Alpha" in result @@ -567,7 +610,7 @@ class TestToolListPeers: # Clear any prior cache entries for these IDs a2a_tools._peer_names.pop("ws-cache-test", None) peers = [{"id": "ws-cache-test", "name": "CacheMe", "status": "online", "role": "w"}] - with patch("a2a_tools.get_peers", return_value=peers): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)): await a2a_tools.tool_list_peers() assert a2a_tools._peer_names.get("ws-cache-test") == "CacheMe" @@ -577,7 +620,7 @@ class TestToolListPeers: import a2a_tools peers = [{"id": "ws-3", "name": "Gamma"}] # no status, no role - with patch("a2a_tools.get_peers", return_value=peers): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)): result = await a2a_tools.tool_list_peers() assert "Gamma" in result