Merge pull request #2404 from Molecule-AI/staging

staging → main: auto-promote 6159429
This commit is contained in:
github-actions[bot] 2026-04-30 13:56:04 -07:00 committed by GitHub
commit 0e3544d7b8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
31 changed files with 2226 additions and 159 deletions

View File

@ -0,0 +1,164 @@
name: E2E Staging External Runtime
# Regression for the four/five workspaces.status=awaiting_agent transitions
# that silently failed in production for five days before migration 046
# extended the workspace_status enum (see
# workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql).
#
# Why this is its own workflow (not folded into e2e-staging-saas.yml):
# - The full-saas harness defaults to runtime=hermes, never exercises
#   external-runtime. Adding an `external` parameter to that script
#   would force every push to staging through both lifecycles in
#   series, doubling the EC2 cold-start budget.
# - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER
#   window, 90s default + sweep interval), which we wait through
#   deliberately. Folding it into hermes would make the long path
#   even longer.
# - It can run in parallel with the hermes E2E since both create
#   fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs
#   `e2e-...`).
#
# Triggers:
# - Push to staging when any source affecting external runtime,
#   hibernation, or the migration set changes.
# - PR review for the same set.
# - Manual workflow_dispatch.
# - Daily cron at 07:30 UTC (catches drift on quiet days; staggered
#   30 min after e2e-staging-saas.yml's 07:00 UTC cron).
#
# Concurrency: serialized so two staging pushes don't fight for the
# same EC2 quota window. cancel-in-progress=false so a half-rolled
# tenant always finishes its teardown.
# NOTE: a bare `on` is a YAML 1.1 boolean token. GitHub's loader treats it
# as the trigger key, but generic yamllint needs its `truthy` rule relaxed
# for this line.
on:
  push:
    branches: [staging, main]
    paths:
      - 'workspace-server/internal/handlers/workspace.go'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_restart.go'
      - 'workspace-server/internal/registry/healthsweep.go'
      - 'workspace-server/internal/registry/liveness.go'
      - 'workspace-server/migrations/**'
      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
      - 'tests/e2e/test_staging_external_runtime.sh'
      - '.github/workflows/e2e-staging-external.yml'
  pull_request:
    branches: [staging, main]
    # Same path set as push — keep the two lists in lockstep.
    paths:
      - 'workspace-server/internal/handlers/workspace.go'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_restart.go'
      - 'workspace-server/internal/registry/healthsweep.go'
      - 'workspace-server/internal/registry/liveness.go'
      - 'workspace-server/migrations/**'
      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
      - 'tests/e2e/test_staging_external_runtime.sh'
      - '.github/workflows/e2e-staging-external.yml'
  workflow_dispatch:
    inputs:
      keep_org:
        description: "Skip teardown for debugging (only via manual dispatch)"
        required: false
        type: boolean
        default: false
      stale_wait_secs:
        # Untyped input — delivered to the job as a string.
        description: "Seconds to wait for the heartbeat-staleness sweep (default 180 = 90s window + 90s buffer)"
        required: false
        default: "180"
  schedule:
    - cron: '30 7 * * *'
concurrency:
  group: e2e-staging-external
  cancel-in-progress: false
permissions:
  contents: read
jobs:
  e2e-staging-external:
    name: E2E Staging External Runtime
    runs-on: ubuntu-latest
    timeout-minutes: 25
    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      # Use the typed `inputs` context, NOT `github.event.inputs`: the
      # latter stringifies booleans, so a manual dispatch with
      # keep_org=false would evaluate the truthy string 'false' and
      # resolve to '1' — silently skipping teardown. `inputs.keep_org`
      # is a real boolean (false is falsy), and the whole context is
      # empty (falsy) on push/schedule triggers, so both resolve to '0'.
      E2E_KEEP_ORG: ${{ inputs.keep_org && '1' || '0' }}
      E2E_STALE_WAIT_SECS: ${{ inputs.stale_wait_secs || '180' }}
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - name: Verify admin token present
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            # Schedule + push triggers must hard-fail when the token is
            # missing — silent skip would mask infra rot. Manual dispatch
            # gets the same hard-fail; an operator running this on a fork
            # without secrets configured needs to know up-front.
            echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
            exit 2
          fi
          echo "Admin token present ✓"
      - name: CP staging health preflight
        run: |
          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
          if [ "$code" != "200" ]; then
            echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
            exit 1
          fi
          echo "Staging CP healthy ✓"
      - name: Run external-runtime E2E
        id: e2e
        run: bash tests/e2e/test_staging_external_runtime.sh
      # Mirror the e2e-staging-saas.yml safety net: if the runner is
      # cancelled (e.g. concurrent staging push), the test script's
      # EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to
      # *this* run id.
      - name: Teardown safety net (runs on cancel/failure)
        if: always()
        env:
          ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
        run: |
          set +e
          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
            | python3 -c "
          import json, sys, os, datetime
          run_id = os.environ.get('GITHUB_RUN_ID', '')
          d = json.load(sys.stdin)
          # Scope STRICTLY to this run id (e2e-ext-YYYYMMDD-<runid>-...)
          # so concurrent runs and unrelated dev probes are not touched.
          # Sweep today AND yesterday so a midnight-crossing run still
          # cleans up its own slug.
          today = datetime.date.today()
          yesterday = today - datetime.timedelta(days=1)
          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
          if not run_id:
              # Without a run id we cannot scope safely; bail rather
              # than risk deleting unrelated tenants.
              sys.exit(0)
          prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates)
          for o in d.get('orgs', []):
              s = o.get('slug', '')
              if s.startswith(prefixes) and o.get('status') != 'purged':
                  print(s)
          " 2>/dev/null)
          if [ -n "$orgs" ]; then
            echo "Safety-net sweep: deleting leftover orgs:"
            echo "$orgs"
            for slug in $orgs; do
              curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
                -H "Authorization: Bearer $ADMIN_TOKEN" \
                -H "Content-Type: application/json" \
                -d "{\"confirm\":\"$slug\"}" >/dev/null 2>&1
            done
          else
            echo "Safety-net sweep: no leftover orgs to clean."
          fi

167
.github/workflows/harness-replays.yml vendored Normal file
View File

@ -0,0 +1,167 @@
name: Harness Replays
# Boots tests/harness (production-shape compose topology with TenantGuard,
# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
# every replay under tests/harness/replays/. Fails the PR if any replay
# fails.
#
# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
# a public route in router.go but forgot to add it to TenantGuard's
# allowlist. The handler-level test in buildinfo_test.go constructed a
# minimal gin engine without TenantGuard — green. The harness's
# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
# inject X-Molecule-Org-Id, so the curl path is identical to production's
# redeploy verifier), but no one ran the harness pre-merge. The bug
# shipped; the redeploy verifier silently soft-warned every tenant as
# "unreachable" for ~1 day before being noticed.
#
# This gate makes "did you actually run the harness?" a CI invariant
# instead of a memory-discipline thing.
#
# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
# to staging+main, real work is gated per-step on detect-changes output.
# One job → one check run → branch-protection-clean (the SKIPPED-in-set
# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).
# NO trigger-level `paths:` filters here — deliberately. This workflow's
# whole design (see header comment) is "always fire, gate the real work in
# detect-changes": with `paths:` on push/pull_request, a PR touching none
# of those paths would never start the workflow at all, so the required
# check would sit "Expected" forever and block the merge — exactly the
# SKIPPED-in-set trap this pattern exists to avoid. Path gating lives in
# the detect-changes job; unrelated PRs pay only a cheap no-op.
on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
  workflow_dispatch:
  merge_group:
    types: [checks_requested]
concurrency:
  # Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
  # cancellation deadlock — see e2e-api.yml's concurrency block for
  # the 2026-04-28 incident that codified this pattern.
  group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false
jobs:
  detect-changes:
    runs-on: ubuntu-latest
    outputs:
      run: ${{ steps.decide.outputs.run }}
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
        id: filter
        # paths-filter diffs against the PR base (pull_request) or the
        # previous pushed commit (push). It has no base to diff against on
        # workflow_dispatch or merge_group, so skip it there — `decide`
        # below force-runs those events instead of letting this step fail
        # or report a bogus 'false'.
        if: github.event_name == 'push' || github.event_name == 'pull_request'
        with:
          filters: |
            run:
              - 'workspace-server/**'
              - 'canvas/**'
              - 'tests/harness/**'
              - '.github/workflows/harness-replays.yml'
      - id: decide
        run: |
          case "${{ github.event_name }}" in
            workflow_dispatch|merge_group)
              # Manual runs always execute. Merge-queue runs must execute
              # the full gate too — a queued merge is the last chance to
              # catch a harness regression before main moves.
              echo "run=true" >> "$GITHUB_OUTPUT"
              ;;
            *)
              echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
              ;;
          esac
  # ONE job that always runs. Real work is gated per-step on
  # detect-changes.outputs.run so an unrelated PR (e.g. doc-only
  # change to molecule-controlplane wired here later) emits the
  # required check without spending CI cycles. Single-job pattern
  # matches e2e-api.yml — see that workflow's comment for why a
  # job-level `if: false` would block branch protection via the
  # SKIPPED-in-set bug.
  harness-replays:
    needs: detect-changes
    name: Harness Replays
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.run != 'true'
        run: |
          echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
          echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
      - if: needs.detect-changes.outputs.run == 'true'
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - name: Checkout sibling plugin repo
        # Dockerfile.tenant copies molecule-ai-plugin-github-app-auth/
        # at the build-context root (see workspace-server/Dockerfile.tenant
        # line 19). PLUGIN_REPO_PAT pattern matches publish-workspace-server-image.yml.
        # Falls back to GITHUB_TOKEN, which only works if that repo grants it
        # read access — NOTE(review): confirm the fallback actually resolves
        # on forks without the PAT secret.
        if: needs.detect-changes.outputs.run == 'true'
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          repository: Molecule-AI/molecule-ai-plugin-github-app-auth
          path: molecule-ai-plugin-github-app-auth
          token: ${{ secrets.PLUGIN_REPO_PAT || secrets.GITHUB_TOKEN }}
      - name: Add /etc/hosts entry for harness-tenant.localhost
        # ubuntu-latest doesn't auto-resolve *.localhost the way macOS
        # sometimes does. seed.sh + replay scripts curl
        # http://harness-tenant.localhost:8080 — without the entry
        # they'd fail with getaddrinfo ENOTFOUND.
        if: needs.detect-changes.outputs.run == 'true'
        run: |
          echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts >/dev/null
          getent hosts harness-tenant.localhost
      - name: Install Python deps for replays
        # peer-discovery-404 (and future replays) eval Python against the
        # running tenant — importing workspace/a2a_client.py pulls in
        # httpx. tests/harness/requirements.txt holds just the HTTP-client
        # surface to keep CI install fast (~3s) vs the full
        # workspace/requirements.txt (~30s).
        if: needs.detect-changes.outputs.run == 'true'
        run: pip install -r tests/harness/requirements.txt
      - name: Run all replays against the harness
        # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
        # every replays/*.sh → tear down via down.sh on EXIT (trap).
        # Non-zero exit on any replay failure.
        #
        # KEEP_UP=1: without this, the script's trap-on-EXIT tears
        # down containers immediately on failure, leaving the dump
        # step below with nothing to dump (verified on PR #2410's
        # first run — tenant became unhealthy, trap fired, dump
        # step saw empty containers). Keeping them up lets the
        # failure path collect tenant/cp-stub/cf-proxy logs. The
        # always-run "Force teardown" step does the actual cleanup.
        if: needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        env:
          # Quoted so the script sees the string "1", not a YAML integer.
          KEEP_UP: "1"
        run: ./run-all-replays.sh
      - name: Dump compose logs on failure
        if: failure() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        run: |
          echo "=== docker compose ps ==="
          docker compose -f compose.yml ps || true
          echo "=== tenant logs ==="
          docker compose -f compose.yml logs tenant || true
          echo "=== cp-stub logs ==="
          docker compose -f compose.yml logs cp-stub || true
          echo "=== cf-proxy logs ==="
          docker compose -f compose.yml logs cf-proxy || true
          echo "=== postgres logs (last 100) ==="
          docker compose -f compose.yml logs --tail 100 postgres || true
      - name: Force teardown
        # We pass KEEP_UP=1 to run-all-replays.sh so the dump step
        # above sees real containers — that means we own teardown
        # explicitly here. Always run.
        if: always() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        run: ./down.sh || true

View File

@ -154,139 +154,15 @@ jobs:
- name: Verify package contents (sanity)
working-directory: ${{ runner.temp }}/runtime-build
# Smoke logic lives in scripts/wheel_smoke.py so the same gate runs
# at both PR-time (runtime-prbuild-compat.yml) and publish-time
# (here). Splitting the smoke across two heredocs let them drift
# apart historically — one script keeps them locked.
run: |
python -m twine check dist/*
# Smoke-import the built wheel to catch import-rewrite mistakes
# before they hit PyPI. Asserts on STABLE INVARIANTS only —
# symbols + classes that are part of the package's public
# contract (BaseAdapter interface, the canonical a2a sentinel,
# core submodules). Don't add feature-flag-style assertions
# here — they fire false-positive every time staging is mid-
# release of that feature.
python -m venv /tmp/smoke
/tmp/smoke/bin/pip install --quiet dist/*.whl
WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
PLATFORM_URL=http://localhost:8080 \
/tmp/smoke/bin/python -c "
# Importing main is the strongest smoke test we can do here:
# main.py is the entry point and pulls every other module
# transitively. If the build script missed an import rewrite
# (e.g. left a bare \`from transcript_auth import ...\` instead
# of \`from molecule_runtime.transcript_auth import ...\` — the
# 0.1.16 incident), this fails with ModuleNotFoundError instead
# of shipping to PyPI and breaking every workspace startup.
# Import the entry-point target by NAME — not just the module.
# The wheel's pyproject.toml declares
# `molecule-runtime = molecule_runtime.main:main_sync` so if
# main_sync goes missing (it did in 0.1.16-0.1.18), every
# workspace startup fails with `ImportError: cannot import name
# 'main_sync'`. Plain `import molecule_runtime.main` doesn't
# catch that because the module loads fine.
from molecule_runtime.main import main_sync # noqa: F401
from molecule_runtime import a2a_client, a2a_tools
from molecule_runtime.builtin_tools import memory
from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
# Stable invariants: package exports + BaseAdapter shape.
assert a2a_client._A2A_ERROR_PREFIX, 'a2a_client missing error sentinel'
assert callable(get_adapter), 'adapters.get_adapter must be callable'
assert hasattr(BaseAdapter, 'name'), 'BaseAdapter interface broken'
assert hasattr(AdapterConfig, '__init__'), 'AdapterConfig dataclass missing'
# Call-shape smoke for AgentCard. Pure imports don't catch
# field-shape regressions in upstream SDKs that only surface
# at construction time. Two bugs of this exact class shipped
# since the a2a-sdk 1.0 migration:
# - state_transition_history=True (fixed in #2179)
# - supported_protocols=[...] (the protobuf field is
# supported_interfaces — caused every workspace boot
# to crash with `ValueError: Protocol message AgentCard
# has no "supported_protocols" field`; fixed alongside
# this smoke)
#
# This block instantiates the EXACT classes main.py uses,
# with the EXACT keyword arguments. If a future a2a-sdk
# upgrade renames any of supported_interfaces / streaming /
# push_notifications / etc., the publish fails here instead
# of breaking every workspace startup. main.py and this
# smoke MUST stay in lockstep — adding a kwarg to one
# without mirroring it here is the regression vector.
from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface
AgentCard(
name='smoke-agent',
description='publish-runtime smoke test',
version='0.0.0-smoke',
supported_interfaces=[
AgentInterface(protocol_binding='https://a2a.g/v1', url='http://localhost:8080'),
],
capabilities=AgentCapabilities(
streaming=True,
push_notifications=False,
),
skills=[
AgentSkill(
id='smoke-skill',
name='Smoke',
description='no-op',
tags=['smoke'],
examples=['noop'],
),
],
default_input_modes=['text/plain', 'application/json'],
default_output_modes=['text/plain', 'application/json'],
)
print('✓ AgentCard call-shape smoke passed')
# Well-known agent-card path probe alignment. main.py's
# _send_initial_prompt() polls AGENT_CARD_WELL_KNOWN_PATH
# to know when the local A2A server is ready. If the SDK
# ever splits the constant value from the path that
# create_agent_card_routes() actually mounts at, every
# workspace silently drops its initial_prompt:
# - Probe gets 404 every attempt.
# - Falls through to 'server not ready after 30s,
# skipping' even though the server is fine.
# - The user hits a fresh chat with no kickoff context.
# This was the #2193 incident class — the v0.x → v1.x
# rename of /.well-known/agent.json → /.well-known/agent-card.json
# plus the constant itself moving to a2a.utils.constants.
# source-tree pytest (test_agent_card_well_known_path.py)
# catches main.py-side regressions; this catches the
# SDK-side ones BEFORE PyPI upload.
from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
from a2a.server.routes import create_agent_card_routes
mounted_paths = [
getattr(r, 'path', None)
for r in create_agent_card_routes(
AgentCard(
name='wk-smoke',
description='well-known mount alignment',
version='0.0.0-smoke',
)
)
]
assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, (
f'AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) '
f'is NOT among paths mounted by create_agent_card_routes '
f'({mounted_paths!r}). The SDK constant and its own route '
f'factory have drifted — workspace probes will 404 forever, '
f'silently dropping every workspace initial_prompt.'
)
print(f'✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})')
# Message helper smoke. a2a-sdk renamed
# new_agent_text_message → new_text_message in the v1.x
# protobuf-flat migration (per the v0→v1 cheat sheet). main.py
# and a2a_executor.py call new_text_message in hot paths; if
# the import breaks, every reply errors with ImportError before
# the message even leaves the workspace. Importing here
# catches a future v2.x rename at publish time.
from a2a.helpers import new_text_message
msg = new_text_message('smoke')
assert msg is not None, 'new_text_message returned None'
print('✓ message helper import + call OK')
print('✓ smoke import passed')
"
/tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
- name: Publish to PyPI (Trusted Publisher / OIDC)
# PyPI side is configured: project molecule-ai-workspace-runtime →

View File

@ -306,6 +306,17 @@ jobs:
if [ $UNREACHABLE_COUNT -gt 0 ]; then
echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
fi
# Belt-and-suspenders sanity floor: same logic as the staging
# variant — see that file's comment for the full rationale.
# Floor only applies when fleet >= 4; below that, canary-verify
# is the actual gate.
TOTAL_VERIFIED=${#SLUGS[@]}
if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
exit 1
fi
if [ $STALE_COUNT -gt 0 ]; then
echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
exit 1

View File

@ -283,6 +283,25 @@ jobs:
if [ $UNREACHABLE_COUNT -gt 0 ]; then
echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
fi
# Belt-and-suspenders sanity floor: if MORE than half the fleet is
# unreachable AND the fleet is large enough that "half down" is
# statistically meaningful, this is a real outage (e.g. new image
# crashes on startup), not a teardown race. Hard-fail.
#
# Floor only applies when TOTAL_VERIFIED >= 4 — below that, the
# canary-verify step is the actual gate for "all tenants down"
# detection (it runs against the canary first and aborts the
# rollout if the canary fails to come up). Without the >=4 gate,
# a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a
# quiet staging push) would re-flake on the exact teardown-race
# condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail.
TOTAL_VERIFIED=${#SLUGS[@]}
if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
exit 1
fi
if [ $STALE_COUNT -gt 0 ]; then
echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
exit 1

View File

@ -34,12 +34,14 @@ on:
# changes (it controls the wheel layout).
- 'workspace/**'
- 'scripts/build_runtime_package.py'
- 'scripts/wheel_smoke.py'
- '.github/workflows/runtime-prbuild-compat.yml'
pull_request:
branches: [main, staging]
paths:
- 'workspace/**'
- 'scripts/build_runtime_package.py'
- 'scripts/wheel_smoke.py'
- '.github/workflows/runtime-prbuild-compat.yml'
workflow_dispatch:
# Required-check support: when this becomes a branch-protection gate,
@ -94,7 +96,9 @@ jobs:
/tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
| grep -E '^(Name|Version):'
- name: Smoke import the PR-built wheel
env:
WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
# Same script publish-runtime.yml runs against the to-be-PyPI wheel.
# Closes the PR-time vs publish-time gap: a PR adding a new SDK
# call-shape no longer passes here (narrow `import main_sync`) only
# to fail post-merge in publish-runtime's broader smoke.
run: |
/tmp/venv-built/bin/python -c "from molecule_runtime.main import main_sync; print('PR-built runtime imports OK')"
/tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"

View File

@ -0,0 +1,48 @@
/**
* Canvas /api/buildinfo version-display endpoint mirroring
* workspace-server's /buildinfo. Lets `curl <url>/api/buildinfo`
* confirm which git SHA is live on a canvas deployment.
*/
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import { GET } from "../route";
const ENV_KEYS = ["VERCEL_GIT_COMMIT_SHA", "VERCEL_GIT_COMMIT_REF", "VERCEL_ENV"];

describe("GET /api/buildinfo", () => {
  // Snapshot of the three Vercel build vars, taken before each test so the
  // suite always starts from an unset environment and the runner's real
  // values are put back afterwards.
  let snapshot: Record<string, string | undefined>;

  beforeEach(() => {
    snapshot = {};
    for (const key of ENV_KEYS) {
      snapshot[key] = process.env[key];
      delete process.env[key];
    }
  });

  afterEach(() => {
    for (const key of ENV_KEYS) {
      const prior = snapshot[key];
      if (prior === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = prior;
      }
    }
  });

  it("returns dev sentinel when Vercel env vars are unset", async () => {
    const body = await (await GET()).json();
    expect(body).toEqual({ git_sha: "dev", git_ref: "", vercel_env: "local" });
  });

  it("reports the SHA Vercel injected at build time", async () => {
    process.env.VERCEL_GIT_COMMIT_SHA = "abc1234567890";
    process.env.VERCEL_GIT_COMMIT_REF = "main";
    process.env.VERCEL_ENV = "production";
    const body = await (await GET()).json();
    expect(body.git_sha).toBe("abc1234567890");
    expect(body.git_ref).toBe("main");
    expect(body.vercel_env).toBe("production");
  });

  it("returns 200 status and JSON content type", async () => {
    const res = await GET();
    expect(res.status).toBe(200);
    expect(res.headers.get("content-type")).toContain("application/json");
  });
});

View File

@ -0,0 +1,18 @@
import { NextResponse } from "next/server";
// Mirror of workspace-server's GET /buildinfo (PR #2398): answers
// `curl <url>/api/buildinfo` with the git SHA the deployment was built
// from, so the same "is the fix live?" flow works on canvas as on tenant
// workspaces.
//
// Vercel's git integration injects VERCEL_GIT_COMMIT_SHA / _REF /
// VERCEL_ENV at build time. Anywhere else (local `next dev`, the harness)
// they are absent, and the route reports the `"dev"` sentinel — the same
// pre-ldflags-injection vocabulary workspace-server uses.
export async function GET() {
  const { VERCEL_GIT_COMMIT_SHA, VERCEL_GIT_COMMIT_REF, VERCEL_ENV } = process.env;
  const buildinfo = {
    git_sha: VERCEL_GIT_COMMIT_SHA ?? "dev",
    git_ref: VERCEL_GIT_COMMIT_REF ?? "",
    vercel_env: VERCEL_ENV ?? "local",
  };
  return NextResponse.json(buildinfo);
}

View File

@ -0,0 +1,112 @@
#!/usr/bin/env bash
# Check whether production tenants and canvas are running latest main.
#
# Usage:
#   ./scripts/ops/check-prod-versions.sh                 # production
#   ENV=staging ./scripts/ops/check-prod-versions.sh     # staging tenants
#
# Outputs a table of {surface, current_sha, expected_sha, status}. Returns
# non-zero if any surface is stale so this can be wired into a periodic
# alert.
#
# Why this exists: every time someone hits a "is the fix live?" question,
# they have to remember the curl pattern + cross-reference with
# `git rev-parse origin/main`. This script does that check uniformly across
# every public surface (workspace tenants + canvas) and gives a one-line
# verdict instead of a stack of one-off curls.

set -euo pipefail

ENV="${ENV:-production}"
EXPECTED_REF="${EXPECTED_REF:-main}"

case "$ENV" in
  production)
    TENANT_DOMAIN="moleculesai.app"
    CANVAS_URL="https://canvas.moleculesai.app"
    # Default canary tenant for production. Override via TENANT_SLUGS=
    # to cover a custom set.
    DEFAULT_TENANTS="hongmingwang reno-stars"
    ;;
  staging)
    TENANT_DOMAIN="staging.moleculesai.app"
    CANVAS_URL="https://canvas-staging.moleculesai.app"
    DEFAULT_TENANTS="" # staging tenants are ephemeral; user must specify
    ;;
  *)
    echo "Unknown ENV=$ENV (expected: production | staging)" >&2
    exit 2
    ;;
esac

TENANT_SLUGS="${TENANT_SLUGS:-$DEFAULT_TENANTS}"

# Pull EXPECTED_SHA from GitHub. Falls back to local git if gh isn't
# logged in — local main may lag origin but is usually close enough for
# debugging, and we still report the comparison clearly.
EXPECTED_SHA=""
if command -v gh >/dev/null 2>&1; then
  # `|| true` keeps set -e from aborting when gh is installed but
  # unauthenticated; an empty result routes us to the git fallback below.
  EXPECTED_SHA=$(gh api "repos/Molecule-AI/molecule-core/commits/${EXPECTED_REF}" --jq '.sha' 2>/dev/null || true)
fi
if [ -z "$EXPECTED_SHA" ]; then
  if git rev-parse "origin/${EXPECTED_REF}" >/dev/null 2>&1; then
    EXPECTED_SHA=$(git rev-parse "origin/${EXPECTED_REF}")
    echo "[check-prod-versions] WARN: gh unavailable, using local origin/${EXPECTED_REF}=${EXPECTED_SHA:0:7} (may lag)"
  else
    echo "[check-prod-versions] ERROR: cannot resolve expected SHA — gh not logged in and origin/${EXPECTED_REF} not fetched" >&2
    exit 2
  fi
fi
EXPECTED_SHORT="${EXPECTED_SHA:0:7}"

echo "Checking ${ENV} surfaces against ${EXPECTED_REF}=${EXPECTED_SHORT}"
echo ""
printf "%-25s %-9s %-9s %s\n" "Surface" "Live" "Expected" "Status"
printf "%-25s %-9s %-9s %s\n" "-------" "----" "--------" "------"

STALE_COUNT=0
UNREACHABLE_COUNT=0
# Probe each tenant's workspace-server /buildinfo endpoint (added in
# PR #2398) and classify it as current, stale, or unreachable.
for slug in $TENANT_SLUGS; do
  url="https://${slug}.${TENANT_DOMAIN}/buildinfo"
  body=$(curl -sS --max-time 15 "$url" 2>/dev/null || echo "")
  live_sha=$(echo "$body" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
  if [ -n "$live_sha" ]; then
    if [ "$live_sha" = "$EXPECTED_SHA" ]; then
      printf "%-25s %-9s %-9s ✓ current\n" "tenant: $slug" "${live_sha:0:7}" "$EXPECTED_SHORT"
    else
      printf "%-25s %-9s %-9s ✗ stale\n" "tenant: $slug" "${live_sha:0:7}" "$EXPECTED_SHORT"
      STALE_COUNT=$((STALE_COUNT + 1))
    fi
  else
    # Empty body or unparsable JSON — either way we could not read a SHA.
    printf "%-25s %-9s %-9s ⚠ unreachable\n" "tenant: $slug" "—" "$EXPECTED_SHORT"
    UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
  fi
done
# Canvas — Next.js /api/buildinfo (PR #2407). Vercel injects
# VERCEL_GIT_COMMIT_SHA at build time so this reflects the deployed
# commit, not the request time.
CANVAS_BODY=$(curl -sS --max-time 15 "${CANVAS_URL}/api/buildinfo" 2>/dev/null || echo "")
CANVAS_SHA=$(echo "$CANVAS_BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
if [ -z "$CANVAS_SHA" ]; then
  printf "%-25s %-9s %-9s ⚠ unreachable (route may not be deployed yet)\n" "canvas" "—" "$EXPECTED_SHORT"
  UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
elif [ "$CANVAS_SHA" = "dev" ]; then
  # "dev" is the route's no-Vercel-env sentinel: the deployment answered,
  # but it wasn't built with the git SHA injected, so its version is
  # unverifiable — counted as unreachable rather than stale.
  printf "%-25s %-9s %-9s ⚠ dev sentinel (Vercel env not injected — check VERCEL_GIT_COMMIT_SHA)\n" "canvas" "dev" "$EXPECTED_SHORT"
  UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
elif [ "$CANVAS_SHA" = "$EXPECTED_SHA" ]; then
  printf "%-25s %-9s %-9s ✓ current\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT"
else
  printf "%-25s %-9s %-9s ✗ stale\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT"
  STALE_COUNT=$((STALE_COUNT + 1))
fi

echo ""
if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
  echo "All surfaces current."
  exit 0
fi
echo "Summary: ${STALE_COUNT} stale, ${UNREACHABLE_COUNT} unreachable."
# Stale is a deploy gap; unreachable is operational (DNS, CF, route absent).
# Both are signal — exit non-zero so cron / CI can alert.
exit 1

145
scripts/wheel_smoke.py Normal file
View File

@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""Smoke-test an installed molecule-ai-workspace-runtime wheel.
Runs the same invariant assertions in two workflows:
* publish-runtime.yml after building dist/*.whl, before PyPI upload
* runtime-prbuild-compat.yml after building the PR's wheel, before merge
Splitting the smoke across two inline heredocs let PR-time and publish-time
drift apart. After 2026-04 we kept hitting publish-time failures for
regressions a PR-time check could have caught. One script, both gates.
Failure here intentionally exits non-zero so the workflow's `run:` step fails.
Each block prints a single line on success so the GH summary log stays
readable; assertion errors propagate with their own message.
Run directly: `python scripts/wheel_smoke.py` after `pip install <wheel>`.
"""
import os
import sys
def smoke_imports_and_invariants() -> None:
    """Module imports + stable contract assertions.

    Importing main_sync by name is the strongest pre-PyPI gate we have for
    import-rewrite mistakes (the 0.1.16 incident, where main.py loaded but
    main_sync was missing because the build script dropped a re-export).
    """
    # Import order is deliberate: the first failing import is what the CI
    # log points at, so the entry point comes first.
    from molecule_runtime.main import main_sync  # noqa: F401
    from molecule_runtime import a2a_client, a2a_tools  # noqa: F401
    from molecule_runtime.builtin_tools import memory  # noqa: F401
    from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig

    # Stable public-contract invariants only — don't add feature-flag-style
    # assertions here; they false-positive mid-release.
    assert a2a_client._A2A_ERROR_PREFIX, "a2a_client missing error sentinel"
    assert callable(get_adapter), "adapters.get_adapter must be callable"
    assert hasattr(BaseAdapter, "name"), "BaseAdapter interface broken"
    assert hasattr(AdapterConfig, "__init__"), "AdapterConfig dataclass missing"
    print("✓ module imports + invariants OK")
def smoke_agent_card_call_shape() -> None:
    """Construct AgentCard with the EXACT kwargs main.py uses.

    Pure imports don't catch field-shape regressions in upstream SDKs that
    only surface at construction time. Two bugs of this exact class shipped
    since the a2a-sdk 1.0 migration:
    - state_transition_history=True (#2179)
    - supported_protocols=[...] (the protobuf field is supported_interfaces;
      every workspace boot crashed with `ValueError: Protocol message
      AgentCard has no "supported_protocols" field`)
    main.py and this block MUST stay in lockstep — adding a kwarg there
    without mirroring it here is the regression vector.
    """
    from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface

    # Constructed and discarded on purpose: only the call shape matters.
    AgentCard(
        name="smoke-agent",
        description="wheel-smoke: AgentCard call-shape",
        version="0.0.0-smoke",
        supported_interfaces=[
            AgentInterface(protocol_binding="https://a2a.g/v1", url="http://localhost:8080"),
        ],
        capabilities=AgentCapabilities(
            streaming=True,
            push_notifications=False,
        ),
        skills=[
            AgentSkill(
                id="smoke-skill",
                name="Smoke",
                description="no-op",
                tags=["smoke"],
                examples=["noop"],
            ),
        ],
        default_input_modes=["text/plain", "application/json"],
        default_output_modes=["text/plain", "application/json"],
    )
    print("✓ AgentCard call-shape smoke passed")
def smoke_well_known_path_alignment() -> None:
    """The SDK's published constant must match the path it actually mounts.

    main.py polls AGENT_CARD_WELL_KNOWN_PATH to detect server readiness. If
    the constant and create_agent_card_routes() drift, every workspace's
    initial_prompt silently drops (probe 404s, falls through to "skipping").
    This was the #2193 incident class.
    """
    from a2a.types import AgentCard
    from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
    from a2a.server.routes import create_agent_card_routes

    # Minimal card — only what route construction needs.
    probe_card = AgentCard(
        name="wk-smoke",
        description="well-known mount alignment",
        version="0.0.0-smoke",
    )
    routes = create_agent_card_routes(probe_card)
    # getattr with a None default: tolerate route objects without .path so
    # the assertion below reports the mismatch instead of an AttributeError.
    mounted_paths = [getattr(route, "path", None) for route in routes]
    assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, (
        f"AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) is NOT among "
        f"paths mounted by create_agent_card_routes ({mounted_paths!r}). The SDK "
        "constant and its own route factory have drifted — workspace probes will "
        "404 forever, silently dropping every workspace initial_prompt."
    )
    print(f"✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})")
def smoke_message_helper() -> None:
    """new_text_message is the v1.x rename of new_agent_text_message.

    main.py and a2a_executor.py call new_text_message in hot paths; if the
    import breaks, every reply errors with ImportError before the message
    even leaves the workspace. Importing AND calling it here catches a
    future v2.x rename at wheel-publish time instead of at runtime.
    """
    from a2a.helpers import new_text_message

    # A None return would mean the helper silently stopped producing
    # messages — treat it the same as an import failure.
    result = new_text_message("smoke")
    assert result is not None, "new_text_message returned None"
    print("✓ message helper import + call OK")
def main() -> int:
    """Run every wheel-smoke check in order; return 0 on success.

    Any failing check raises (assert/ValueError/ImportError), which the
    __main__ guard converts into a non-zero process exit.
    """
    # main.py validates WORKSPACE_ID at module-import time via platform_auth.
    # Seed placeholders so the smoke doesn't trip on the env-var guard;
    # setdefault keeps any values the caller already exported.
    placeholders = {
        "WORKSPACE_ID": "00000000-0000-0000-0000-000000000000",
        "PLATFORM_URL": "http://localhost:8080",
    }
    for key, value in placeholders.items():
        os.environ.setdefault(key, value)

    # Ordered list: imports first (cheapest, broadest), then the
    # construction-time and path-alignment checks that depend on them.
    checks = (
        smoke_imports_and_invariants,
        smoke_agent_card_call_shape,
        smoke_well_known_path_alignment,
        smoke_message_helper,
    )
    for check in checks:
        check()
    print("✓ wheel smoke passed")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status so CI can
    # gate the wheel publish on this smoke.
    sys.exit(main())

View File

@ -0,0 +1,348 @@
#!/bin/bash
# test_staging_external_runtime.sh — E2E regression for the
# external-runtime workspace lifecycle on a real staging tenant.
#
# Why this test exists: the four/five sites that write 'awaiting_agent'
# / 'hibernating' to workspaces.status had been silently failing in
# production for five days (see migration 046) before a static drift
# gate caught the enum gap. Unit tests passed because sqlmock matched
# the SQL by regex but didn't enforce the live enum constraint, and
# every existing E2E exercised hermes (not external) so the silent
# failures never surfaced. This test pins the four awaiting_agent
# transitions in real Postgres on a real staging tenant.
#
# Verification path:
# 1. Provision a fresh tenant (test_staging_full_saas.sh harness shape).
# 2. Create an external-runtime workspace with NO URL → assert
# response status == 'awaiting_agent' AND GET on the workspace
# returns the same. (Pre-fix the row stuck on 'provisioning'
# because the UPDATE in workspace.go:333 silently failed.)
# 3. Register a fake URL via /registry/register → assert transition
# to 'online'. (Pre-fix this branch worked because it writes
# 'online' which IS in the enum.)
# 4. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER (90s
# default) + a sweep interval → assert transition back to
# 'awaiting_agent'. (Pre-fix the sweep UPDATE failed silently and
# the workspace stuck on 'online' indefinitely.)
#
# Hibernation is intentionally NOT covered here — it has its own timing
# model (idle threshold) and warrants a separate harness.
#
# Required env (mirrors test_staging_full_saas.sh):
# MOLECULE_CP_URL default: https://staging-api.moleculesai.app
# MOLECULE_ADMIN_TOKEN CP admin bearer (Railway CP_ADMIN_API_TOKEN)
#
# Optional env:
# E2E_PROVISION_TIMEOUT_SECS default 900 (15 min cold EC2 budget)
# E2E_KEEP_ORG 1 → skip teardown (debugging only)
# E2E_RUN_ID Slug suffix; CI: ${GITHUB_RUN_ID}
# E2E_STALE_WAIT_SECS default 180 (90s window + 90s buffer)
# E2E_INTENTIONAL_FAILURE 1 → break a step on purpose to verify
# the EXIT trap still tears down (mirrors
# the full-saas harness's safety net).
#
# Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
# 4 teardown leak.
set -euo pipefail

# Config — every knob comes from the env contract documented in the header.
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
# :? makes a missing admin token fail fast (exit 2 class) with a pointer at
# where the credential lives.
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
# Default suffix is time-of-day + PID so two local runs the same second
# can't collide; CI passes GITHUB_RUN_ID via E2E_RUN_ID instead.
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$)}"
STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
# Normalize to a DNS/subdomain-safe slug: lowercase, [a-z0-9-] only, <=32
# chars. Later inline-python blocks interpolate $SLUG into code; this
# sanitization is what makes that safe.
SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)

# Timestamped log helpers; fail() exits with the generic failure code 1.
log() { echo "[$(date +%H:%M:%S)] $*"; }
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }

# Shared curl flags: --fail-with-body (curl >= 7.76) turns HTTP >= 400 into
# a non-zero exit under set -e while still emitting the body.
CURL_COMMON=(-sS --fail-with-body --max-time 30)
# ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
CLEANUP_DONE=0
# Idempotent teardown, registered for EXIT/INT/TERM below. Exits 4 (the
# documented "teardown leak" code) if the org is still listed after the
# DELETE; otherwise preserves the exit status it was entered with.
cleanup_org() {
  # Capture the in-flight exit status before any command here overwrites $?.
  local entry_rc=$?
  # Double-fire guard: an INT/TERM trap run is followed by the EXIT trap.
  if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi
  CLEANUP_DONE=1
  if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
    log "E2E_KEEP_ORG=1 → leaving $SLUG behind for inspection"
    return 0
  fi
  log "Cleanup: deleting tenant $SLUG..."
  # Best-effort DELETE: the || branch keeps set -e from aborting the trap
  # when the org is already gone or the CP momentarily errors.
  curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
    -H "Authorization: Bearer $ADMIN_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \
    && ok "Teardown request accepted" \
    || log "Teardown returned non-2xx (may already be gone)"
  # Leak poll (up to 60s): pessimistic default leak_count=1, so a failing
  # list call or JSON parse also counts as "still leaked" rather than
  # producing a false clean.
  local leak_count=1 elapsed=0
  while [ "$elapsed" -lt 60 ]; do
    leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
      -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
      | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \
      2>/dev/null || echo 1)
    [ "$leak_count" = "0" ] && break
    sleep 5
    elapsed=$((elapsed + 5))
  done
  if [ "$leak_count" != "0" ]; then
    echo "⚠️ LEAK: org $SLUG still present post-teardown (count=$leak_count)" >&2
    exit 4
  fi
  ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
  # Let the documented exit codes (0-4) pass through untouched; anything
  # else (e.g. a signal exit status) is normalized to generic failure 1.
  case "$entry_rc" in
    0|1|2|3|4) ;;
    *) exit 1 ;;
  esac
}
trap cleanup_org EXIT INT TERM
# ─── 0. Preflight ───────────────────────────────────────────────────────
log "═══════════════════════════════════════════════════════════════════"
log " Staging external-runtime E2E (regression for migration 046)"
log " CP: $CP_URL"
log " Slug: $SLUG"
log " Stale: ${STALE_WAIT_SECS}s wait window"
log "═══════════════════════════════════════════════════════════════════"
# Fail fast while nothing exists yet: if the CP is unreachable there is
# nothing to tear down, so exiting via fail() here is cheap and safe.
curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed"
ok "CP reachable"
# admin_call METHOD PATH [extra curl args...] — authenticated CP admin
# request using the shared curl flags and the fleet-level admin bearer.
admin_call() {
  local method="$1" path="$2"
  shift 2
  curl "${CURL_COMMON[@]}" -X "$method" "$CP_URL$path" \
    -H "Authorization: Bearer $ADMIN_TOKEN" \
    -H "Content-Type: application/json" "$@"
}
# ─── 1. Create org ──────────────────────────────────────────────────────
log "1/8 Creating org $SLUG..."
CREATE_RESP=$(admin_call POST /cp/admin/orgs \
  -d "{\"slug\":\"$SLUG\",\"name\":\"E2E ext $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
# Prints '' when the 'id' key is absent — the next line hard-fails on that.
ORG_ID=$(echo "$CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
[ -z "$ORG_ID" ] && fail "Org create response missing 'id'"
ok "Org created (id=$ORG_ID)"
# ─── 2. Wait for tenant provisioning ────────────────────────────────────
# Terminal status from /cp/admin/orgs is 'running' (org_instances.status),
# NOT 'ready' — same field the full-saas harness polls. 'failed' surfaces
# diagnostic dump and aborts. See test_staging_full_saas.sh step 2 for
# the field-bugfix history (2026-04-21, last_error path).
log "2/8 Waiting for tenant (up to ${PROVISION_TIMEOUT_SECS}s)..."
DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
LAST_STATUS=""
while true; do
  if [ "$(date +%s)" -gt "$DEADLINE" ]; then
    fail "Tenant provisioning timed out (last: $LAST_STATUS)"
  fi
  # A transient list failure must not kill the poll under set -e — fall
  # back to an empty org list and retry on the next iteration.
  LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}')
  # Inline python prints instance_status for our slug ('' when no row).
  # Interpolating $SLUG into the program is safe: it was sanitized to
  # [a-z0-9-] above, so it cannot break out of the quoted literal.
  STATUS=$(echo "$LIST_JSON" | python3 -c "
import json, sys
d = json.load(sys.stdin)
for o in d.get('orgs', []):
    if o.get('slug') == '$SLUG':
        print(o.get('instance_status', ''))
        sys.exit(0)
print('')
" 2>/dev/null || echo "")
  # Log only on change so a 15-min poll doesn't flood the CI log.
  if [ "$STATUS" != "$LAST_STATUS" ]; then
    log " instance_status: $STATUS"
    LAST_STATUS="$STATUS"
  fi
  case "$STATUS" in
    running) break ;;
    failed)
      # Dump the full org row (indented) before aborting — this is the
      # only diagnostic we get for a provisioning failure.
      log "── DIAGNOSTIC BURST (step 2 — tenant provisioning failed) ──"
      echo "$LIST_JSON" | python3 -c "
import json, sys
d = json.load(sys.stdin)
for o in d.get('orgs', []):
    if o.get('slug') == '$SLUG':
        print(json.dumps(o, indent=2))
        sys.exit(0)
print('(no org row found for slug=$SLUG — DB drift?)')
" 2>&1 | sed 's/^/  /'
      log "── END DIAGNOSTIC ──"
      fail "Tenant provisioning failed for $SLUG (see diagnostic above)"
      ;;
    *) sleep 15 ;;
  esac
done
ok "Tenant provisioning complete"
# Derive the tenant URL the same way the full-saas harness does: strip the
# scheme and any path from CP_URL with parameter expansion, then map the
# API hostname to its tenant-domain counterpart.
CP_HOST="${CP_URL#http://}"
CP_HOST="${CP_HOST#https://}"
CP_HOST="${CP_HOST%%/*}"
case "$CP_HOST" in
  api.*)         DERIVED_DOMAIN="${CP_HOST#api.}" ;;
  staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
  *)             DERIVED_DOMAIN="$CP_HOST" ;;
esac
# MOLECULE_TENANT_DOMAIN overrides the derivation when set.
TENANT_DOMAIN="${MOLECULE_TENANT_DOMAIN:-$DERIVED_DOMAIN}"
TENANT_URL="https://$SLUG.$TENANT_DOMAIN"
log " TENANT_URL=$TENANT_URL"
# ─── 3. Per-tenant admin token + TLS readiness ──────────────────────────
log "3/8 Fetching per-tenant admin token..."
TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))")
[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token"
# Log only the token LENGTH, never the token — keeps the credential out of
# CI logs.
ok "Token retrieved (len=${#TENANT_TOKEN})"
log "Waiting for tenant TLS / DNS..."
TLS_DEADLINE=$(( $(date +%s) + 15 * 60 ))
while true; do
  # -k (skip cert verification) tolerates the window where DNS resolves
  # but the cert isn't issued yet; this probe only checks reachability —
  # the authenticated calls below go through CURL_COMMON without -k.
  if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then break; fi
  if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
    fail "Tenant URL never responded 2xx on /health within 15min"
  fi
  sleep 5
done
ok "Tenant reachable"
# tenant_call METHOD PATH [extra curl args...] — authenticated request to
# the tenant API, carrying both the per-tenant bearer and the TenantGuard
# org header.
tenant_call() {
  local method="$1" path="$2"
  shift 2
  curl "${CURL_COMMON[@]}" -X "$method" "$TENANT_URL$path" \
    -H "Authorization: Bearer $TENANT_TOKEN" \
    -H "X-Molecule-Org-Id: $ORG_ID" \
    "$@"
}
# ─── 4. Create external workspace (no URL) ──────────────────────────────
# This is the FIRST silent-failure path (workspace.go:333). Pre-migration
# 046, the response would say status=awaiting_agent but the row stuck
# on whatever the create handler set first (typically 'provisioning')
# because the follow-up UPDATE failed the enum cast.
log "4/8 Creating external workspace (no URL — exercises workspace.go:333)..."
WS_CREATE_RESP=$(tenant_call POST /workspaces \
  -d '{"name":"ext-e2e","runtime":"external","external":true}')
WS_ID=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
WS_RESP_STATUS=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
# The workspace auth token is read from either connection.auth_token
# (nested) or a top-level auth_token; parse failures collapse to '' —
# step 5 hard-fails if it is still empty there.
WS_AUTH_TOKEN=$(echo "$WS_CREATE_RESP" | python3 -c "
import json,sys
try:
    d = json.load(sys.stdin)
    conn = d.get('connection') or {}
    print(conn.get('auth_token','') or d.get('auth_token',''))
except Exception:
    print('')
")
[ -z "$WS_ID" ] && fail "Workspace create missing id: $WS_CREATE_RESP"
[ "$WS_RESP_STATUS" != "awaiting_agent" ] && fail "Expected response status=awaiting_agent, got $WS_RESP_STATUS"
ok "Workspace created (id=$WS_ID, response status=awaiting_agent)"
# This GET is the proof that the row actually has the value (not just
# the response body lying). Pre-migration-046 the UPDATE would have
# silently failed and this would return whatever 'provisioning' the
# initial INSERT left. Post-fix it must be 'awaiting_agent'.
log " Verifying DB row..."
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
ok "DB row stored as awaiting_agent (proof migration 046 applied)"
# ─── 5. Register the workspace (transitions to online) ──────────────────
# Pre-fix this path was actually fine because it writes 'online', a value
# already in the enum. We exercise it anyway because the registration
# implicitly walks resolveDeliveryMode (registry.go:resolveDeliveryMode),
# which DOES read runtime + apply the new poll-default introduced by
# PR #2382.
log "5/8 Registering workspace via /registry/register..."
[ -z "$WS_AUTH_TOKEN" ] && fail "No workspace auth token returned — register impossible"
# Payload contract (workspace-server/internal/models/workspace.go RegisterPayload):
#   id            — required, the workspace UUID (NOT "workspace_id" — that's the
#                   heartbeat payload field; mixing them yields a 400 from
#                   ShouldBindJSON because `id` has binding:"required").
#   agent_card    — required (binding:"required"); minimal valid card is name+skills.
#   delivery_mode — set explicitly to "poll" so url validation is skipped
#                   regardless of whether the deployed image has the
#                   runtime=external→poll default from PR #2382. Observed
#                   2026-04-30 17:18Z: a freshly-provisioned staging tenant
#                   was running an older workspace-server :latest image
#                   that lacked resolveDeliveryMode's external→poll branch,
#                   so the implicit default was push and validateAgentURL
#                   400'd on example.invalid. Asserting on the implicit
#                   default makes the *register call* itself fragile to
#                   image-tag drift on the fleet — verify the default
#                   separately (step 5b assertion) without depending on it
#                   here.
#   url           — accepted but not dispatched-to in poll mode, so
#                   example.invalid is a valid sentinel.
REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
# Disable --fail-with-body for this one call so a 4xx surfaces the response
# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
# -w appends HTTP_CODE=<n> on its own line AFTER the body; `|| true` keeps
# set -e from discarding the captured output on transport errors.
REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
  -H "X-Molecule-Org-Id: $ORG_ID" \
  -H "Content-Type: application/json" \
  -d "$REGISTER_BODY") || true
log " register response: $(echo "$REGISTER_RESP" | head -c 300)"
echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
# The transition proof: a GET after register must show 'online'.
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
ok "Workspace transitioned to online"
# Confirm the register handler echoed back delivery_mode=poll. We read
# this from the register RESPONSE, not the workspace GET response, because
# the GET handler's SELECT (workspace.go:597) doesn't fetch delivery_mode
# — its column list pre-dates the delivery_mode column from #2339 PR 1.
# Surfacing delivery_mode in GET is tracked separately; not gating on it
# here keeps this test focused on the awaiting_agent transitions.
#
# The curl above appended "HTTP_CODE=<n>" as the FINAL line (-w with a
# leading \n), so strip the LAST line to recover the body. `head -n 1`
# would silently truncate a pretty-printed (multi-line) JSON body and make
# the json.load below die with a misleading parse error instead of the
# real delivery_mode mismatch.
REGISTER_BODY_JSON=$(echo "$REGISTER_RESP" | sed '$d')
REGISTER_DELIVERY_MODE=$(echo "$REGISTER_BODY_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('delivery_mode',''))")
if [ "$REGISTER_DELIVERY_MODE" = "poll" ]; then
  ok "delivery_mode=poll (register response echoed explicit value)"
else
  fail "Register response delivery_mode=$REGISTER_DELIVERY_MODE (expected poll). Body: $REGISTER_BODY_JSON"
fi
# ─── 6. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER ────────
# This is the SECOND silent-failure path (registry/healthsweep.go's
# sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
# UPDATE silently failed and the workspace stuck on 'online' forever
# even though no agent was alive. We wait the full window + a sweep
# interval and assert the row transitions back to 'awaiting_agent'.
log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
# Deliberately a single fixed sleep, not a poll: asserting the status at
# T+STALE_WAIT_SECS (default 90s window + 90s buffer) checks the sweep
# fired within its documented window.
sleep "$STALE_WAIT_SECS"
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$STALE_STATUS" != "awaiting_agent" ] && \
  fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
# ─── 7. Re-register and confirm we can come back online ─────────────────
# This proves the awaiting_agent state is recoverable (re-registrable),
# which is the whole point of using it instead of 'offline'.
log "7/8 Re-registering after stale → confirming recovery to online..."
# Same payload contract as step 5 (id + agent_card both required). See note
# there for why workspace_id would 400. $REGISTER_BODY is reused verbatim.
REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
  -H "X-Molecule-Org-Id: $ORG_ID" \
  -H "Content-Type: application/json" \
  -d "$REGISTER_BODY") || true
log " re-register response: $(echo "$REREG_RESP" | head -c 300)"
echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
[ "$RECOVERED_STATUS" != "online" ] && \
  fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
# ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
log "8/8 All four awaiting_agent transitions verified."
log "═══════════════════════════════════════════════════════════════════"
ok "External-runtime E2E PASSED on $SLUG"
log "═══════════════════════════════════════════════════════════════════"

119
tests/harness/README.md Normal file
View File

@ -0,0 +1,119 @@
# Production-shape local harness
The harness brings up the SaaS tenant topology on localhost using the
same `Dockerfile.tenant` image that ships to production. Tests run
against `http://harness-tenant.localhost:8080` and exercise the
SAME code path a real tenant takes — including TenantGuard middleware,
the `/cp/*` reverse proxy, the canvas reverse proxy, and a
Cloudflare-tunnel-shape header rewrite layer.
## Why this exists
Local `go run ./cmd/server` skips:
- `TenantGuard` middleware (no `MOLECULE_ORG_ID` env)
- `/cp/*` reverse proxy mount (no `CP_UPSTREAM_URL` env)
- `CANVAS_PROXY_URL` (canvas runs separately on `:3000`)
- Header rewrites that production's CF tunnel + LB perform
- Strict-auth mode (no live `ADMIN_TOKEN`)
Bugs that survive `go run` and ship to production almost always live
in one of those layers. The harness activates ALL of them.
## Topology
```
client
cf-proxy nginx, mirrors CF tunnel header rewrites
↓ (Host:harness-tenant.localhost, X-Forwarded-*)
tenant workspace-server/Dockerfile.tenant — same image as prod
↓ (CP_UPSTREAM_URL=http://cp-stub:9090, /cp/* proxied)
cp-stub minimal Go service, mocks CP wire surface
postgres same version as production
redis same version as production
```
## Quickstart
```bash
cd tests/harness
./up.sh # builds + starts all services
./seed.sh # mints admin token, registers two sample workspaces
./replays/peer-discovery-404.sh
./replays/buildinfo-stale-image.sh
./down.sh # tear down + remove volumes
```
To run every replay in one shot (boot, seed, run-all, teardown):
```bash
cd tests/harness
./run-all-replays.sh # full lifecycle; non-zero exit if any replay fails
KEEP_UP=1 ./run-all-replays.sh # leave harness up for debugging
REBUILD=1 ./run-all-replays.sh # rebuild images before booting
```
First-time setup needs an `/etc/hosts` entry so `harness-tenant.localhost`
resolves to the local cf-proxy:
```bash
echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts
```
(macOS resolves `*.localhost` automatically in some setups; Linux
typically does not.)
## Replay scripts
Each replay script reproduces a real bug class against the harness so
fixes can be verified locally before deploy. The bar for adding a
replay is "this bug shipped to production despite local E2E being
green" — the script becomes the regression gate that closes that gap.
| Replay | Closes | What it proves |
|--------|--------|----------------|
| `peer-discovery-404.sh` | #2397 | tool_list_peers surfaces the actual reason instead of "may be isolated" |
| `buildinfo-stale-image.sh` | #2395 | GIT_SHA reaches the binary; verify-step comparison logic works |
To add a new replay:
1. Drop a script under `replays/` named after the issue.
2. The script's purpose: reproduce the production failure mode against
the harness, then assert the fix is present. PASS criterion is the
post-fix behavior.
3. The `run-all-replays.sh` runner picks up every `replays/*.sh` script
automatically — no per-replay registration needed.
## Extending the cp-stub
`cp-stub/main.go` serves the minimum surface for the existing replays
plus a catch-all that returns 501 + a clear message when the tenant
asks for a route the stub doesn't implement. To add a new CP route:
1. Add a `mux.HandleFunc` in `cp-stub/main.go` for the path.
2. Return the same wire shape the real CP returns. The contract is
"wire compatibility with the staging CP at the time of writing" —
document it with a comment pointing at the real CP handler.
3. Add a replay script that exercises the path.
## What the harness does NOT cover
- Real TLS / cert handling (CF terminates TLS in production; harness is
HTTP-only).
- Cloudflare API edge cases (rate limits, DNS propagation timing).
- Real EC2 / SSM / EBS behavior (image-cache replay simulates the
outcome but not the AWS API surface).
- Cross-region or multi-AZ topology.
- Real production data scale.
These are intentional Phase 1 limits. If a bug class hits one of these
gaps, escalate to staging E2E rather than expanding the harness past
its mandate of "exercise the tenant binary in production-shape topology."
## Roadmap
- **Phase 1 (shipped):** harness + cp-stub + cf-proxy + 2 replays + `run-all-replays.sh` runner.
- **Phase 2:** convert `tests/e2e/test_api.sh` to run against the
harness instead of localhost. Make harness-based E2E a required CI
check (a workflow that invokes `run-all-replays.sh` on every PR).
- **Phase 3:** config-coherence lint that diffs harness env list
against production CP's env list, fails CI on drift.

View File

@ -0,0 +1,68 @@
# cf-proxy — Cloudflare-tunnel-shape reverse proxy for the local harness.
#
# Production path: agent → CF tunnel → AWS LB → tenant container.
# This config replays the same header rewrites the CF tunnel does so
# the tenant sees the same Host + X-Forwarded-* it would in production.
#
# The tenant's TenantGuard middleware activates on MOLECULE_ORG_ID; the
# canvas's same-origin fetches use the Host header for cookie scoping.
# Both behave correctly in production because CF rewrites Host to the
# tenant subdomain — this proxy reproduces that locally.
#
# How tests reach it (the proxy listens on plain HTTP :8080; there is no
# TLS listener here):
#   curl --resolve 'harness-tenant.localhost:8080:127.0.0.1' \
#     http://harness-tenant.localhost:8080/health
# or via an /etc/hosts entry (see tests/harness/README.md for the entry).
worker_processes 1;
events { worker_connections 256; }

http {
  # WebSocket upgrade plumbing. The tenant exposes /ws (plus SSE at
  # /events/stream and MCP /mcp/stream); nginx only switches protocols
  # when BOTH the Upgrade and Connection headers are forwarded upstream.
  # A blanket `proxy_set_header Connection "";` (the keepalive idiom)
  # strips the handshake, so every /ws connect would fail through the
  # proxy while working against the tenant directly — exactly the class
  # of topology-only bug this harness exists to surface. The standard
  # map forwards "upgrade" only when the client requested it and clears
  # the header otherwise, preserving upstream keepalive for plain HTTP.
  map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      '';
  }

  # Map the wildcard <slug>.localhost to the tenant container. The
  # tenant container itself doesn't care which slug routed to it —
  # what matters is that the Host header it sees matches what
  # production's CF tunnel sets, so cookie/CORS/TenantGuard logic
  # exercises the same code path.
  server {
    listen 8080;
    server_name *.localhost localhost;

    # Cap upload at 50MB to mirror the staging tenant nginx limit;
    # chat upload tests will fail closed if the platform handler
    # ever silently expands its limit (catches the failure mode
    # opposite of the chat-files lazy-heal incident).
    client_max_body_size 50m;

    location / {
      proxy_pass http://tenant:8080;

      # Header parity with CF tunnel + AWS LB. Production CF sets
      # X-Forwarded-Proto=https; we keep http here because TLS
      # termination in compose is unnecessary for testing the
      # tenant logic — TLS is a CF concern, not a tenant bug
      # surface. If TLS-specific bugs ever bite, add cert-manager
      # + listen 8443 ssl here.
      proxy_set_header Host $host;
      proxy_set_header X-Real-IP $remote_addr;
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header X-Forwarded-Host $host;
      proxy_set_header X-Forwarded-Proto $scheme;

      # Streamable HTTP / SSE / WebSocket. Disabling buffering
      # reproduces CF tunnel's pass-through streaming semantics
      # (CF tunnel = no buffering by default; nginx default IS
      # buffering, which would mask issue #2397-class streaming
      # bugs by accumulating output until the client disconnects).
      proxy_buffering off;
      proxy_request_buffering off;
      proxy_http_version 1.1;
      proxy_set_header Upgrade $http_upgrade;
      proxy_set_header Connection $connection_upgrade;

      # Read timeout — CF tunnel default is 100s. Setting this to
      # the same value catches "long agent run finishes after the
      # proxy already closed the upstream" failure mode.
      proxy_read_timeout 100s;
    }
  }
}

140
tests/harness/compose.yml Normal file
View File

@ -0,0 +1,140 @@
# Production-shape harness for local E2E.
#
# Reproduces the SaaS tenant topology on localhost using the SAME
# images that ship to production:
#
# client → cf-proxy (nginx, mimics CF tunnel headers)
# → tenant (workspace-server/Dockerfile.tenant — combined platform + canvas)
# → cp-stub (control-plane stand-in) for /cp/* and CP-callback paths
# → postgres + redis (same versions as production)
#
# Why this matters: the workspace-server binary IS identical between
# local and production. The bugs that survive local E2E are topology
# bugs — env-gated middleware (TenantGuard, CP proxy, Canvas proxy),
# auth state, header rewrites, real production image. This harness
# activates ALL of them.
#
# Quickstart:
# cd tests/harness && ./up.sh
# ./seed.sh
# ./replays/peer-discovery-404.sh # reproduces issue #2397
#
# Env config:
# GIT_SHA — passed to the tenant build for /buildinfo verification.
# Defaults to "harness" so /buildinfo distinguishes the
# harness build from any cached image.
# CP_STUB_PEERS_MODE — peers failure mode for replay scripts.
# "" / "404" / "401" / "500" / "timeout".
services:
  postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: harness
      POSTGRES_PASSWORD: harness
      POSTGRES_DB: molecule
    networks: [harness-net]
    # No named volume on purpose: harness state is ephemeral, and a fresh
    # DB per lifecycle keeps replays deterministic.
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U harness"]
      interval: 2s
      timeout: 5s
      retries: 10

  redis:
    image: redis:7-alpine
    networks: [harness-net]
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 2s
      timeout: 5s
      retries: 10

  cp-stub:
    build:
      context: ./cp-stub
    environment:
      PORT: "9090"
      # Peers failure mode toggled by replay scripts — "", "404", "401",
      # "500", or "timeout" (see the header comment above).
      CP_STUB_PEERS_MODE: "${CP_STUB_PEERS_MODE:-}"
    networks: [harness-net]
    healthcheck:
      # wget comes from busybox in the alpine runtime stage of cp-stub's
      # Dockerfile.
      test: ["CMD-SHELL", "wget -q -O- http://localhost:9090/healthz || exit 1"]
      interval: 2s
      timeout: 5s
      retries: 10

  # The actual production tenant image — same Dockerfile.tenant CI publishes.
  # This is the load-bearing part of the harness: every bug class that hides
  # behind "but it works locally" is reproducible HERE, against this image,
  # not against `go run ./cmd/server`.
  tenant:
    build:
      context: ../..
      dockerfile: workspace-server/Dockerfile.tenant
      args:
        GIT_SHA: "${GIT_SHA:-harness}"
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      cp-stub:
        condition: service_healthy
    environment:
      DATABASE_URL: "postgres://harness:harness@postgres:5432/molecule?sslmode=disable"
      REDIS_URL: "redis://redis:6379"
      PORT: "8080"
      PLATFORM_URL: "http://tenant:8080"
      MOLECULE_ENV: "production"
      # SECRETS_ENCRYPTION_KEY is required when MOLECULE_ENV=production —
      # crypto.InitStrict() refuses to boot without it. up.sh generates a
      # fresh 32-byte key per harness lifetime via `openssl rand -base64 32`
      # and exports it into this compose file's interpolation environment.
      # The :? sentinel makes the misuse loud — running `docker compose up`
      # directly without going through up.sh fails fast with a clear error
      # rather than getting a confusing tenant-unhealthy timeout.
      SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}"
      # ADMIN_TOKEN flips the platform into strict-auth mode (matches
      # production's CP-minted token configuration). Seeded value lets
      # E2E scripts authenticate without going through CP.
      ADMIN_TOKEN: "harness-admin-token"
      # MOLECULE_ORG_ID — activates TenantGuard middleware. Every request
      # must carry X-Molecule-Org-Id matching this value. Replays bugs
      # that only fire in SaaS mode.
      MOLECULE_ORG_ID: "harness-org"
      # CP_UPSTREAM_URL — activates the /cp/* reverse proxy mount in
      # router.go. Without this set, /cp/* would 404 and the canvas
      # bootstrap would silently drift from production behavior.
      CP_UPSTREAM_URL: "http://cp-stub:9090"
      RATE_LIMIT: "1000"
      # Canvas auto-proxy — entrypoint-tenant.sh exports CANVAS_PROXY_URL
      # by default; keeping it explicit here makes the topology readable.
      # localhost works because platform and canvas share this container
      # (Dockerfile.tenant is the combined image per the header above).
      CANVAS_PROXY_URL: "http://localhost:3000"
    networks: [harness-net]
    healthcheck:
      # NOTE(review): assumes the tenant image ships wget — confirm against
      # Dockerfile.tenant (not visible from this file).
      test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/health || exit 1"]
      interval: 5s
      timeout: 5s
      retries: 20

  # Cloudflare-tunnel-shape proxy — strips the :8080 suffix, rewrites
  # Host to the tenant subdomain, injects X-Forwarded-*. Tests target
  # http://harness-tenant.localhost:8080 and exercise the production
  # routing layer.
  cf-proxy:
    image: nginx:1.27-alpine
    depends_on:
      tenant:
        condition: service_healthy
    volumes:
      - ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
    # Bind to 127.0.0.1 only — the harness uses a hardcoded ADMIN_TOKEN
    # ("harness-admin-token") so binding 0.0.0.0 (compose's default)
    # would expose admin access to anyone on the local network or VPN.
    # Loopback-only is safe for E2E and prevents a known-token leak.
    ports:
      - "127.0.0.1:8080:8080"
    networks: [harness-net]

networks:
  harness-net:
    name: molecule-harness-net

View File

@ -0,0 +1,14 @@
# cp-stub — minimal CP stand-in for the local production-shape harness.
# See main.go for the rationale. Self-contained build, no module deps.

# Build stage: static binary (CGO disabled, stripped via -s -w) so the
# runtime stage can be bare alpine with no libc/toolchain requirements.
FROM golang:1.25-alpine AS builder
WORKDIR /src
# Only go.mod is copied (no go.sum): the module has zero external deps.
COPY go.mod ./
COPY main.go ./
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /cp-stub .

# Runtime stage: minimal alpine plus the CA bundle (needed if the stub
# ever makes outbound TLS calls).
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=builder /cp-stub /cp-stub
EXPOSE 9090
ENTRYPOINT ["/cp-stub"]

View File

@ -0,0 +1,3 @@
module github.com/Molecule-AI/molecule-monorepo/tests/harness/cp-stub
go 1.25

View File

@ -0,0 +1,113 @@
// cp-stub — minimal control-plane stand-in for the local production-shape harness.
//
// In production, the tenant Go server reverse-proxies /cp/* to the SaaS
// control-plane (molecule-controlplane). This stub plays that role on
// localhost so we can exercise the SAME code path the tenant takes in
// production — `if cpURL := os.Getenv("CP_UPSTREAM_URL"); cpURL != ""`
// in workspace-server/internal/router/router.go fires, the proxy mount
// activates, and tests exercise the real tenant→CP wire.
//
// This is NOT a CP reimplementation. It serves the minimum surface to:
// 1. Boot the tenant image without /cp/* breaking the canvas bootstrap.
// 2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx,
// returns malformed JSON) by toggling env vars.
//
// Scope is bounded by what the tenant + canvas actually call. Add new
// handlers as new replay scenarios demand them. Drift from real CP is
// tolerated because each handler is named for the exact path it serves —
// when the real CP changes, the failing scenario tells us where to look.
package main
import (
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"sync/atomic"
)
// redeployFleetCalls tracks how many times /cp/admin/tenants/redeploy-fleet
// was invoked. Replay scripts assert > 0 to confirm the workflow's redeploy
// step actually reached the stub (catches misrouted CP_URL configs).
// atomic.Int64 keeps the counter race-free under concurrent handler calls.
var redeployFleetCalls atomic.Int64
// main wires the stub's HTTP surface onto a ServeMux and serves it on
// :$PORT (default 9090). Exact-path registrations below take precedence
// over the trailing-slash "/cp/" catch-all per net/http.ServeMux's
// longest-pattern matching, so registration order does not matter.
func main() {
	mux := http.NewServeMux()

	// /cp/auth/me — canvas calls this on bootstrap; minimal user record
	// keeps the canvas from redirecting to login during local E2E.
	mux.HandleFunc("/cp/auth/me", func(w http.ResponseWriter, r *http.Request) {
		writeJSON(w, 200, map[string]any{
			"id":     "harness-user",
			"email":  "harness@local",
			"org_id": "harness-org",
			"roles":  []string{"admin"},
		})
	})

	// /cp/admin/tenants/redeploy-fleet — exercised by the
	// redeploy-tenants-on-{staging,main} workflow's local replay. Returns
	// the same shape the real CP returns so the verify-fleet logic in CI
	// can be tested without spinning up a real EC2 fleet.
	mux.HandleFunc("/cp/admin/tenants/redeploy-fleet", func(w http.ResponseWriter, r *http.Request) {
		// Count every hit so /__stub/state can prove the call arrived.
		redeployFleetCalls.Add(1)
		writeJSON(w, 200, map[string]any{
			"ok": true,
			"results": []map[string]any{
				{
					"slug":          "harness-tenant",
					"phase":         "redeploy",
					"ssm_status":    "Success",
					"ssm_exit_code": 0,
					"healthz_ok":    true,
				},
			},
		})
	})

	// __stub/state — expose stub state (counters) so replay scripts can
	// assert the tenant actually reached us. Read-only.
	mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, r *http.Request) {
		writeJSON(w, 200, map[string]any{
			"redeploy_fleet_calls": redeployFleetCalls.Load(),
		})
	})

	// Catch-all for any /cp/* the tenant proxies. Keeps the harness from
	// crashing the canvas when a new CP route is added — surfaces a clear
	// "stub doesn't implement X" error instead of opaque 502 from the
	// reverse proxy.
	mux.HandleFunc("/cp/", func(w http.ResponseWriter, r *http.Request) {
		writeJSON(w, 501, map[string]any{
			"error": "cp-stub: handler not implemented for " + r.Method + " " + r.URL.Path,
			"hint":  "add a handler in tests/harness/cp-stub/main.go for the scenario you're testing",
		})
	})

	// /healthz — readiness probe for compose's depends_on.
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		writeJSON(w, 200, map[string]any{"status": "ok"})
	})

	// ListenAndServe blocks until the listener fails; any error is fatal
	// (the container should restart rather than run without a listener).
	addr := ":" + envOr("PORT", "9090")
	log.Printf("cp-stub listening on %s", addr)
	if err := http.ListenAndServe(addr, mux); err != nil {
		log.Fatal(err)
	}
}
func writeJSON(w http.ResponseWriter, code int, body any) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
if err := json.NewEncoder(w).Encode(body); err != nil {
fmt.Fprintf(os.Stderr, "cp-stub: write json: %v\n", err)
}
}
func envOr(k, def string) string {
if v := os.Getenv(k); v != "" {
return v
}
return def
}

6
tests/harness/down.sh Executable file
View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Tear down the harness compose stack, including named volumes (-v) and
# any orphaned containers left behind by renamed services.
set -euo pipefail

# Operate from the harness directory regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

docker compose -f compose.yml down -v --remove-orphans
echo "[harness] down + volumes removed."

View File

@ -0,0 +1,75 @@
#!/usr/bin/env bash
# Replay for issue #2395 — local proof that the /buildinfo verify gate
# closes the SaaS deploy-chain blindness.
#
# Prior behavior: redeploy-fleet returned ssm_status=Success based on
# the SSM RPC return code alone. EC2 tenants kept serving the cached
# :latest digest because `docker compose up -d` is a no-op when the
# tag hasn't been invalidated. ssm_status=Success was lying.
#
# This replay simulates that condition locally:
# 1. Boot the harness with GIT_SHA=fix-applied.
# 2. Curl /buildinfo and assert it returns "fix-applied" (the new code
# actually shipped).
# 3. Negative test: curl with a different EXPECTED_SHA and assert the
# mismatch detection logic the workflow uses returns failure.
#
# This proves the verify-step's jq lookup + comparison logic works
# against the SAME Dockerfile.tenant production builds. If the
# /buildinfo route ever stops being wired through, this replay
# catches it before it reaches a production tenant.
set -euo pipefail

# This replay talks to the tenant purely over HTTP — no filesystem paths
# are needed. (The previously-computed HERE/HARNESS_ROOT locals were dead
# code: assigned but never read.)
BASE="${BASE:-http://harness-tenant.localhost:8080}"

# 1. Confirm /buildinfo wire shape — same shape the workflow's jq lookup expects.
echo "[replay] curl $BASE/buildinfo ..."
BUILD_JSON=$(curl -sS "$BASE/buildinfo")
echo "[replay] $BUILD_JSON"

# jq's // "" alternative collapses both a missing key and a null value to
# the empty string, so one -z test covers both failure shapes.
ACTUAL_SHA=$(echo "$BUILD_JSON" | jq -r '.git_sha // ""')
if [ -z "$ACTUAL_SHA" ]; then
  echo "[replay] FAIL: /buildinfo response missing git_sha field — workflow's jq lookup would null"
  exit 1
fi
echo "[replay] git_sha=$ACTUAL_SHA"

# 2. Assert the harness build threaded GIT_SHA through. If we got "dev",
#    the Dockerfile arg / ldflags wiring is broken — same regression
#    class that made #2395 invisible until production.
EXPECTED_FROM_HARNESS="${HARNESS_GIT_SHA:-harness}"
if [ "$ACTUAL_SHA" = "dev" ]; then
  echo "[replay] FAIL: /buildinfo returned 'dev' — Dockerfile.tenant ARG GIT_SHA isn't reaching the binary"
  echo "[replay] This regresses #2395 by silencing the deploy-verify gate."
  exit 1
fi
if [ "$ACTUAL_SHA" != "$EXPECTED_FROM_HARNESS" ]; then
  echo "[replay] WARN: /buildinfo returned '$ACTUAL_SHA' but harness was built with GIT_SHA='$EXPECTED_FROM_HARNESS'"
  echo "[replay] Image may be cached from a previous run. Run ./up.sh --rebuild to force a fresh build."
fi

# 3. Negative test — replay the workflow's mismatch detection by
#    comparing the actual SHA to a deliberately-wrong expected SHA.
WRONG_EXPECTED="0000000000000000000000000000000000000000"
if [ "$ACTUAL_SHA" = "$WRONG_EXPECTED" ]; then
  echo "[replay] FAIL: /buildinfo returned all-zero SHA — wiring inverted"
  exit 1
fi

# 4. Replay the workflow's exact comparison logic so a regression in
#    the verify step's bash gets caught here. (Given step 3 passed this
#    is expected to always flag a mismatch — it pins the comparison's
#    polarity, not a new condition.)
MISMATCH_DETECTED=0
if [ "$ACTUAL_SHA" != "$WRONG_EXPECTED" ]; then
  MISMATCH_DETECTED=1
fi
if [ "$MISMATCH_DETECTED" != "1" ]; then
  echo "[replay] FAIL: workflow comparison logic would not flag a real mismatch"
  exit 1
fi

echo ""
echo "[replay] PASS: /buildinfo wire shape, GIT_SHA injection, and mismatch detection all work in"
echo "         production-shape topology. The redeploy-fleet verify-step covers what it claims to."
View File

@ -0,0 +1,139 @@
#!/usr/bin/env bash
# Replay for issue #2397 — local proof that peer-discovery surfaces
# actionable diagnostics instead of "may be isolated".
#
# Prior behavior: tool_list_peers returned "No peers available (this
# workspace may be isolated)" regardless of WHY peers were empty —
# five distinct conditions (200+empty, 401, 403, 404, 5xx, network)
# collapsed to one ambiguous message.
#
# This replay proves two things, separately:
# (a) WIRE: the platform side of the contract — the tenant's
# /registry/<unregistered>/peers returns 404. If this regresses
# (e.g. tenant starts returning 200 with empty list, or 500),
# the runtime helper would parse it differently and the agent
# would see a different diagnostic. The harness catches that here.
# (b) PARSE: the runtime helper, given a 404, produces a diagnostic
# containing "404" + "register" hints. Done in unit tests against
#      a mock httpx response (test_a2a_client.py::TestGetPeersWithDiagnostic)
#      — the harness re-asserts the same contract here against a real
# Python eval that does NOT depend on workspace auth tokens.
#
# Why split the assertion: the Python eval here doesn't have the
# workspace's auth token file, so going through get_peers_with_diagnostic
# directly would hit the platform without auth and produce a different
# branch (401 instead of 404). Splitting (a) from (b) keeps each
# assertion targeting exactly what it claims to test.
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HARNESS_ROOT="$(dirname "$HERE")"
cd "$HARNESS_ROOT"

# seed.sh persists workspace IDs into .seed.env; bootstrap on first run so
# this replay can be invoked standalone (outside run-all-replays.sh).
if [ ! -f .seed.env ]; then
  echo "[replay] no .seed.env — running ./seed.sh first..."
  ./seed.sh
fi
# shellcheck source=/dev/null
source .seed.env

BASE="${BASE:-http://harness-tenant.localhost:8080}"
ADMIN="harness-admin-token"
ORG="harness-org"

# ─── (a) WIRE: tenant returns 404 for an unregistered workspace ────────
# A freshly-minted UUID is guaranteed not to be in the registry.
# NOTE(review): lowercasing presumably matches the platform's workspace-ID
# normalization — confirm against the registry handler.
ROGUE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')"
echo "[replay] (a) WIRE: querying /registry/$ROGUE_ID/peers (unregistered workspace)..."
# -w '%{http_code}' puts only the status code on stdout; the body goes to
# a temp file so it can be dumped on failure without re-requesting.
HTTP_CODE=$(curl -sS -o /tmp/peer-replay.json -w '%{http_code}' \
  -H "Authorization: Bearer $ADMIN" \
  -H "X-Molecule-Org-Id: $ORG" \
  -H "X-Workspace-ID: $ROGUE_ID" \
  "$BASE/registry/$ROGUE_ID/peers")
echo "[replay] tenant responded HTTP $HTTP_CODE"
if [ "$HTTP_CODE" != "404" ]; then
  echo "[replay] FAIL (a): expected 404 from /registry/<unregistered>/peers, got $HTTP_CODE"
  echo "[replay] This is a platform-side regression — the runtime's diagnostic helper"
  echo "[replay] would see a different status code than the unit tests cover."
  cat /tmp/peer-replay.json
  exit 1
fi

# ─── (b) PARSE: helper converts a synthetic 404 to actionable diagnostic ─
#
# We construct a synthetic httpx 404 response and run the helper against
# it directly. This isolates the parse branch we want to test from the
# auth-context concerns of going through the network. The helper's network
# branches are exhaustively covered by tests/test_a2a_client.py — this is
# a regression-guard that the helper IS in the install, IS importable in
# the harness's Python env, and IS reading the status code.
#
# The heredoc prints repr(diag) (or a __SKIP__ marker) as its only stdout,
# which the command substitution captures into DIAGNOSTIC.
WORKSPACE_PATH="$(cd "$HARNESS_ROOT/../../workspace" && pwd)"
DIAGNOSTIC=$(WORKSPACE_ID="harness-rogue" PYTHONPATH="$WORKSPACE_PATH" \
  python3 - "$WORKSPACE_PATH" <<'PYEOF'
import asyncio
import sys
import types
from unittest.mock import AsyncMock, MagicMock, patch
# Stub platform_auth so a2a_client imports cleanly without requiring a
# real workspace token file. The helper's auth_headers() only matters
# when going through the network; we're feeding it a mock response.
_pa = types.ModuleType("platform_auth")
_pa.auth_headers = lambda: {}
_pa.self_source_headers = lambda: {}
sys.modules.setdefault("platform_auth", _pa)
sys.path.insert(0, sys.argv[1])
import a2a_client  # noqa: E402
# This replay validates PR #2399's diagnostic helper. If the workspace
# runtime in the current checkout pre-dates that fix, fail with a
# clear message instead of an opaque AttributeError.
if not hasattr(a2a_client, "get_peers_with_diagnostic"):
    print("__SKIP__: workspace/a2a_client.py is pre-#2399 (no get_peers_with_diagnostic).")
    sys.exit(0)
resp = MagicMock()
resp.status_code = 404
resp.json = MagicMock(return_value={"detail": "not found"})
mock_client = AsyncMock()
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
mock_client.get = AsyncMock(return_value=resp)
async def main():
    with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
        peers, diag = await a2a_client.get_peers_with_diagnostic()
        print(repr(diag))
asyncio.run(main())
PYEOF
)

# Graceful degradation on pre-fix checkouts: (a) already passed, so exit 0
# with an explicit partial-pass message rather than a false failure.
if [[ "$DIAGNOSTIC" == __SKIP__:* ]]; then
  echo "[replay] (b) SKIP: ${DIAGNOSTIC#__SKIP__: }"
  echo "[replay] Re-run after #2399 lands on staging."
  echo ""
  echo "[replay] PASS (a) only: peer-discovery wire returns 404 (parse branch skipped — see above)."
  exit 0
fi

echo "[replay] (b) PARSE: helper diagnostic = $DIAGNOSTIC"
# Three independent content assertions: status code present, actionable
# "register" guidance present, and the old ambiguous wording absent.
if ! echo "$DIAGNOSTIC" | grep -q "404"; then
  echo "[replay] FAIL (b): diagnostic missing '404' — helper regressed to swallow-the-status-code"
  exit 1
fi
if ! echo "$DIAGNOSTIC" | grep -qi "regist"; then
  echo "[replay] FAIL (b): diagnostic missing 'register' guidance — helper regressed to opaque message"
  exit 1
fi
if echo "$DIAGNOSTIC" | grep -qi "may be isolated"; then
  echo "[replay] FAIL (b): diagnostic still says 'may be isolated' — fix didn't reach this code path"
  exit 1
fi

echo ""
echo "[replay] PASS: peer-discovery (a) wire returns 404, (b) helper produces actionable diagnostic."

View File

@ -0,0 +1,14 @@
# Harness-replay Python deps — minimal set for replays/*.sh scripts that
# eval Python against the running tenant (e.g. importing
# workspace/a2a_client.py to assert parser behavior).
#
# This is intentionally smaller than workspace/requirements.txt: the
# replays don't need a2a-sdk, langchain, opentelemetry, etc. — only the
# HTTP client surface that the imported helpers depend on. Adding the
# full workspace deps would slow every harness CI run by ~30s for no
# gain.
#
# Add a line here (with a version constraint matching workspace/requirements.txt)
# when a new replay introduces a new Python import.
httpx>=0.28.1

View File

@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Run every replay under tests/harness/replays/ against a fresh harness.
#
# Boots the harness (up.sh + seed.sh), runs each `replays/*.sh` in
# alphabetical order, tracks pass/fail, and tears down on exit. Returns
# non-zero if any replay failed.
#
# Usage:
# ./run-all-replays.sh # boot, run, teardown
# KEEP_UP=1 ./run-all-replays.sh # leave harness running on exit (debug)
# REBUILD=1 ./run-all-replays.sh # rebuild images before booting
#
# CI usage: invoke without flags. The trap-on-EXIT teardown ensures we
# don't leak Docker resources when a replay fails partway through.
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$HERE"

REPLAYS_DIR="$HERE/replays"
if [ ! -d "$REPLAYS_DIR" ]; then
  echo "[run-all] no replays/ directory at $REPLAYS_DIR — nothing to run"
  exit 1
fi

# nullglob so an empty replays/ yields an empty array rather than a
# literal "*.sh" element.
shopt -s nullglob
REPLAYS=("$REPLAYS_DIR"/*.sh)
shopt -u nullglob
if [ ${#REPLAYS[@]} -eq 0 ]; then
  echo "[run-all] replays/ is empty — nothing to run"
  exit 1
fi

# cleanup runs exactly once: the first statement clears all traps so the
# explicit `exit` below cannot re-enter via the EXIT trap. (Previously a
# Ctrl-C ran teardown twice — once for INT, then again for the EXIT fired
# by cleanup's own exit.)
cleanup() {
  local exit_code=$?
  trap - EXIT INT TERM
  if [ "${KEEP_UP:-0}" = "1" ]; then
    echo ""
    echo "[run-all] KEEP_UP=1 — leaving harness up. Tear down manually with ./down.sh"
  else
    echo ""
    echo "[run-all] tearing down harness..."
    ./down.sh >/dev/null 2>&1 || echo "[run-all] WARN: ./down.sh exited non-zero"
  fi
  exit "$exit_code"
}
trap cleanup EXIT INT TERM

echo "[run-all] booting harness..."
if [ "${REBUILD:-0}" = "1" ]; then
  ./up.sh --rebuild
else
  ./up.sh
fi
echo "[run-all] seeding workspaces..."
./seed.sh

# Only pass/fail are tracked: replays that "skip" exit 0 with a __SKIP__
# marker in their own output and are counted as passes (the unused
# SKIP_COUNT accumulator has been removed).
PASS_COUNT=0
FAIL_COUNT=0
FAILED_NAMES=()
for replay in "${REPLAYS[@]}"; do
  name=$(basename "$replay" .sh)
  echo ""
  echo "[run-all] ━━━ $name ━━━"
  # Run in a condition position so set -e doesn't abort the loop on a
  # failing replay — we want the full summary either way.
  if bash "$replay"; then
    PASS_COUNT=$((PASS_COUNT + 1))
    echo "[run-all] PASS: $name"
  else
    FAIL_COUNT=$((FAIL_COUNT + 1))
    FAILED_NAMES+=("$name")
    echo "[run-all] FAIL: $name"
  fi
done

echo ""
echo "[run-all] ============================="
echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed (of ${#REPLAYS[@]} total)"
if [ ${FAIL_COUNT} -gt 0 ]; then
  echo "[run-all] Failed:"
  for name in "${FAILED_NAMES[@]}"; do
    echo "[run-all]   - $name"
  done
  exit 1
fi
echo "[run-all] All replays passed."

65
tests/harness/seed.sh Executable file
View File

@ -0,0 +1,65 @@
#!/usr/bin/env bash
# Seed the harness with two registered workspaces so peer-discovery
# replay scripts have something to discover.
#
# - "alpha" parent (tier 0)
# - "beta" child of alpha (tier 1)
#
# Both register via the platform's /registry/register endpoint, which
# is what real workspaces do at boot. The platform then has them in its
# DB; tool_list_peers from inside alpha can resolve beta as a peer.
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$HERE"

BASE="${BASE:-http://harness-tenant.localhost:8080}"
ADMIN="harness-admin-token"
ORG="harness-org"

# Admin-scoped curl wrapper. --fail makes curl exit non-zero on HTTP >= 400
# so, combined with set -e, a rejected workspace creation aborts the seed.
# Previously a 4xx/5xx from POST /workspaces "succeeded" silently (the
# response body went to /dev/null and curl exits 0 on HTTP errors by
# default), writing bogus IDs into .seed.env for every replay to trip on.
curl_admin() {
  curl -sS --fail -H "Authorization: Bearer $ADMIN" \
    -H "X-Molecule-Org-Id: $ORG" \
    -H "Content-Type: application/json" "$@"
}

echo "[seed] confirming tenant is reachable via cf-proxy..."
HEALTH=$(curl -sS "$BASE/health" || echo "")
if [ -z "$HEALTH" ]; then
  echo "[seed] FAILED: $BASE/health unreachable. Did ./up.sh complete? Did you add"
  echo "        127.0.0.1 harness-tenant.localhost to /etc/hosts?"
  exit 1
fi
echo "[seed] $HEALTH"

echo "[seed] confirming /buildinfo returns the harness GIT_SHA..."
BUILD=$(curl -sS "$BASE/buildinfo" || echo "")
echo "[seed] $BUILD"

# Mint a fresh admin-call workspace ID for the parent. Platform's
# /admin/workspaces/:id/test-token mints a per-workspace bearer; the
# replay scripts use it to call the workspace-scoped routes.
echo "[seed] creating workspace 'alpha' (parent)..."
ALPHA_ID=$(uuidgen | tr '[:upper:]' '[:lower:]')
curl_admin -X POST "$BASE/workspaces" \
  -d "{\"id\":\"$ALPHA_ID\",\"name\":\"alpha\",\"tier\":0,\"runtime\":\"langgraph\"}" \
  >/dev/null
echo "[seed] alpha id=$ALPHA_ID"

echo "[seed] creating workspace 'beta' (child of alpha)..."
BETA_ID=$(uuidgen | tr '[:upper:]' '[:lower:]')
curl_admin -X POST "$BASE/workspaces" \
  -d "{\"id\":\"$BETA_ID\",\"name\":\"beta\",\"tier\":1,\"parent_id\":\"$ALPHA_ID\",\"runtime\":\"langgraph\"}" \
  >/dev/null
echo "[seed] beta id=$BETA_ID"

# Stash IDs so replay scripts pick them up.
{
  echo "ALPHA_ID=$ALPHA_ID"
  echo "BETA_ID=$BETA_ID"
} > "$HERE/.seed.env"

echo ""
echo "[seed] done. IDs persisted to tests/harness/.seed.env"
echo "[seed]   ALPHA_ID=$ALPHA_ID"
echo "[seed]   BETA_ID=$BETA_ID"

55
tests/harness/up.sh Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env bash
# Bring the production-shape harness up.
#
# Usage: ./up.sh [--rebuild]
#
# Always operates in tests/harness/ regardless of where it's invoked
# from — test scripts under tests/harness/replays/ source it via the
# absolute path, so cd-ing first prevents compose-context surprises.
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$HERE"

REBUILD=false
for arg in "$@"; do
  case "$arg" in
    --rebuild) REBUILD=true ;;
  esac
done

# Generate a per-run encryption key. The tenant runs with
# MOLECULE_ENV=production (intentional, to replay prod-shape bugs), and
# crypto.InitStrict() refuses to boot without SECRETS_ENCRYPTION_KEY.
# Generate fresh so:
#   - No key-shaped string lives in the repo (avoids muscle-memorying a
#     hardcoded value into other places + secret-scanner false positives).
#   - Each harness lifetime gets a unique key, mimicking prod's per-tenant
#     isolation. Persistence across runs isn't required — the harness DB
#     is wiped on every ./down.sh.
# Honor a caller-supplied value if already exported (lets a debug session
# pin a key for reproducibility).
if [ -z "${SECRETS_ENCRYPTION_KEY:-}" ]; then
  SECRETS_ENCRYPTION_KEY=$(openssl rand -base64 32)
  export SECRETS_ENCRYPTION_KEY
fi

if [ "$REBUILD" = true ]; then
  docker compose -f compose.yml build --no-cache tenant cp-stub
fi

echo "[harness] starting cp-stub + postgres + redis + tenant + cf-proxy ..."
docker compose -f compose.yml up -d --wait

echo "[harness] /etc/hosts entry for harness-tenant.localhost..."
# POSIX ERE via -E: the previous BRE pattern used \+, a GNU extension that
# is undefined in POSIX BRE — on non-GNU grep it never matched, so the
# hint printed even when the /etc/hosts entry was already present.
if ! grep -qE '^127\.0\.0\.1[[:space:]]+harness-tenant\.localhost' /etc/hosts; then
  echo "  (skip — your /etc/hosts may not resolve *.localhost. If tests fail with"
  echo "   'getaddrinfo' errors, add:  127.0.0.1 harness-tenant.localhost)"
fi

echo ""
echo "[harness] up. Tenant:  http://harness-tenant.localhost:8080/health"
echo "                       http://harness-tenant.localhost:8080/buildinfo"
echo "          cp-stub:     http://localhost (internal-only via compose net)"
echo ""
echo "Next: ./seed.sh   # mint admin token + register sample workspaces"
View File

@ -223,13 +223,24 @@ func main() {
registry.StartLivenessMonitor(c, onWorkspaceOffline)
})
// Proactive container health sweep — detects dead containers faster than Redis TTL.
// Checks all "online" workspaces against Docker every 15 seconds.
if prov != nil {
go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
})
}
// Proactive health sweep — two passes per tick:
// 1. Docker-side: checks "online" workspaces against the local Docker
// daemon (only runs when prov is non-nil, i.e. self-hosted mode).
// 2. Remote-side: scans runtime='external' rows whose last_heartbeat_at
// is past REMOTE_LIVENESS_STALE_AFTER and flips them to
// awaiting_agent. Runs regardless of provisioner mode — SaaS
// tenants need this even though they don't run Docker locally,
// because external-runtime workspaces are operator-managed and
// the platform-side liveness sweep is the only thing that
// transitions them off 'online' when the operator's CLI dies.
//
// Pre-2026-04-30 this goroutine was gated on prov != nil, which silently
// disabled the remote-side sweep on every SaaS tenant. The function in
// healthsweep.go has always handled nil checker correctly; only the
// orchestration was wrong. See #2392's CI failure for the trace.
go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
})
// Orphan-container reconcile sweep — finds running containers
// whose workspace row is already status='removed' and stops

View File

@ -53,6 +53,7 @@ const tenantOrgIDHeader = "X-Molecule-Org-Id"
// here only bypasses the cross-org routing check, not auth.
var tenantGuardAllowlist = map[string]struct{}{
"/health": {},
"/buildinfo": {},
"/metrics": {},
"/registry/register": {},
"/registry/heartbeat": {},

View File

@ -8,13 +8,15 @@ import (
"github.com/gin-gonic/gin"
)
// helper: build a router with TenantGuard configured to `orgID` and two
// representative routes — a regular API route and two allowlisted ones.
// helper: build a router with TenantGuard configured to `orgID` and a
// representative API route plus the public allowlisted ones (/health,
// /buildinfo, /metrics).
func newGuardedRouter(orgID string) *gin.Engine {
gin.SetMode(gin.TestMode)
r := gin.New()
r.Use(TenantGuardWithOrgID(orgID))
r.GET("/health", func(c *gin.Context) { c.String(200, "ok") })
r.GET("/buildinfo", func(c *gin.Context) { c.String(200, "buildinfo") })
r.GET("/metrics", func(c *gin.Context) { c.String(200, "metrics") })
r.GET("/workspaces", func(c *gin.Context) { c.String(200, "workspaces") })
return r
@ -71,10 +73,14 @@ func TestTenantGuard_MissingHeaderIs404(t *testing.T) {
}
// Allowlisted paths bypass the guard even in tenant mode — required for health
// probes (Fly Machines checks) and Prometheus scrape.
// probes (Fly Machines checks), Prometheus scrape, and the redeploy-fleet
// /buildinfo verification step. /buildinfo without an org header used to
// 404-via-NoRoute → canvas (HTML), which made the redeploy verifier think
// every tenant was stale even when the binary was current. Pin this so a
// future allowlist edit can't silently regress that check.
func TestTenantGuard_AllowlistBypassesCheck(t *testing.T) {
r := newGuardedRouter("org-abc")
for _, path := range []string{"/health", "/metrics"} {
for _, path := range []string{"/health", "/buildinfo", "/metrics"} {
w := doRequest(r, path, "") // no header
if w.Code != 200 {
t.Errorf("%s: allowlisted path should return 200 without header, got %d", path, w.Code)

View File

@ -229,19 +229,61 @@ async def send_a2a_message(target_url: str, message: str) -> str:
return _format_a2a_error(last_exc, target_url)
async def get_peers() -> list[dict]:
"""Get this workspace's peers from the platform registry."""
async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]:
"""Get this workspace's peers, returning (peers, diagnostic).
diagnostic is None when the call succeeded (status 200, even if the list
is empty). When peers is [] for a non-trivial reason (auth failure,
workspace-id missing from registry, platform error, network error),
diagnostic is a short human-readable string explaining what went wrong
    so callers can surface it instead of "may be isolated" — see #2397.
The legacy get_peers() shim below preserves the bare-list contract for
non-tool callers.
"""
url = f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers"
async with httpx.AsyncClient(timeout=10.0) as client:
try:
resp = await client.get(
f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers",
url,
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
)
if resp.status_code == 200:
return resp.json()
return []
except Exception:
return []
except Exception as e:
return [], f"Cannot reach platform at {PLATFORM_URL}: {e}"
if resp.status_code == 200:
try:
data = resp.json()
except Exception as e:
return [], f"Platform returned 200 but body was not JSON: {e}"
if not isinstance(data, list):
return [], f"Platform returned 200 but body was not a list: {type(data).__name__}"
return data, None
if resp.status_code in (401, 403):
return [], (
f"Authentication to platform failed (HTTP {resp.status_code}). "
"The workspace bearer token may be invalid — restarting the workspace usually re-mints it."
)
if resp.status_code == 404:
return [], (
f"Workspace ID {WORKSPACE_ID} is not registered with the platform (HTTP 404). "
"Re-registration via the platform's /registry/register endpoint is needed."
)
if 500 <= resp.status_code < 600:
return [], f"Platform error: HTTP {resp.status_code}."
return [], f"Unexpected platform response: HTTP {resp.status_code}."
async def get_peers() -> list[dict]:
"""Get this workspace's peers from the platform registry.
Bare-list shim over get_peers_with_diagnostic() discards the diagnostic
so callers that don't care about the failure reason (e.g. system-prompt
bootstrap formatters) get the same shape they always had.
"""
peers, _ = await get_peers_with_diagnostic()
return peers
async def get_workspace_info() -> dict:

View File

@ -18,6 +18,7 @@ from a2a_client import (
_peer_names,
discover_peer,
get_peers,
get_peers_with_diagnostic,
get_workspace_info,
send_a2a_message,
)
@ -410,9 +411,16 @@ async def tool_send_message_to_user(message: str, attachments: list[str] | None
async def tool_list_peers() -> str:
"""List all workspaces this agent can communicate with."""
peers = await get_peers()
peers, diagnostic = await get_peers_with_diagnostic()
if not peers:
return "No peers available (this workspace may be isolated)"
if diagnostic is not None:
# Non-trivial empty: auth failure / 404 / 5xx / network — surface
# the actual reason so the user/agent doesn't have to guess. #2397.
return f"No peers found. {diagnostic}"
return (
"You have no peers in the platform registry. "
"(No parent, no children, no siblings registered.)"
)
lines = []
for p in peers:
status = p.get("status", "unknown")

View File

@ -577,6 +577,149 @@ class TestGetPeers:
assert headers_sent.get("X-Workspace-ID") == a2a_client.WORKSPACE_ID
# ---------------------------------------------------------------------------
# get_peers_with_diagnostic — issue #2397
#
# Pin: an empty peer list MUST come with an actionable diagnostic on every
# non-200 + every transport failure. The bug was that get_peers swallowed
# every failure mode behind `return []`, leaving the agent's tool wrapper
# with no way to distinguish "you have no peers" from "auth broke" / "404
# from registry" / "platform 5xx" / "network timeout". Each of these
# requires a different operator action.
# ---------------------------------------------------------------------------
class TestGetPeersWithDiagnostic:
async def test_200_returns_peers_and_no_diagnostic(self):
"""200 with valid list → (peers, None). diagnostic stays None on success."""
import a2a_client
peers = [{"id": "ws-1", "name": "Alpha"}]
resp = _make_response(200, peers)
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == peers
assert diag is None
async def test_200_empty_list_returns_no_diagnostic(self):
"""200 with [] → (peers=[], diag=None). Truly no peers is success, not error."""
import a2a_client
resp = _make_response(200, [])
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is None
async def test_401_returns_auth_diagnostic(self):
"""401 → diagnostic mentions auth + restart hint."""
import a2a_client
resp = _make_response(401, {"detail": "unauthorized"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "401" in diag
assert "Authentication" in diag or "authentication" in diag.lower()
async def test_403_returns_auth_diagnostic(self):
"""403 → same auth-failure diagnostic shape as 401."""
import a2a_client
resp = _make_response(403, {"detail": "forbidden"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "403" in diag
async def test_404_returns_registration_diagnostic(self):
"""404 → diagnostic tells operator the workspace ID is missing from the registry."""
import a2a_client
resp = _make_response(404, {"detail": "not found"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "404" in diag
assert "registered" in diag.lower() or "registration" in diag.lower()
async def test_500_returns_platform_error_diagnostic(self):
"""5xx → 'Platform error: HTTP <code>.'"""
import a2a_client
resp = _make_response(503, {"detail": "service unavailable"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "503" in diag
assert "Platform error" in diag or "platform error" in diag.lower()
async def test_network_exception_returns_unreachable_diagnostic(self):
"""httpx exception → diagnostic mentions PLATFORM_URL + the underlying error."""
import a2a_client
mock_client = _make_mock_client(get_exc=TimeoutError("connection timed out"))
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "Cannot reach platform" in diag or "cannot reach" in diag.lower()
assert "timed out" in diag
async def test_200_with_non_list_body_returns_diagnostic(self):
"""200 but body is a dict → diagnostic flags shape mismatch (regression guard)."""
import a2a_client
resp = _make_response(200, {"oops": "should have been a list"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "list" in diag.lower()
async def test_get_peers_shim_preserves_bare_list_contract(self):
"""get_peers() still returns just list[dict] — no API break for non-tool callers."""
import a2a_client
peers = [{"id": "ws-1", "name": "Alpha"}]
resp = _make_response(200, peers)
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result = await a2a_client.get_peers()
# Must be a list, not a tuple — bare-list shim contract.
assert isinstance(result, list)
assert result == peers
# ---------------------------------------------------------------------------
# get_workspace_info
# ---------------------------------------------------------------------------

View File

@ -536,11 +536,54 @@ class TestToolSendMessageToUser:
class TestToolListPeers:
async def test_no_peers_returns_isolated_message(self):
async def test_true_empty_returns_no_peers_message_without_diagnostic(self):
"""200 + empty list → 'no peers in the platform registry' (no failure)."""
import a2a_tools
with patch("a2a_tools.get_peers", return_value=[]):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], None)):
result = await a2a_tools.tool_list_peers()
assert "No peers available" in result
# The new wording explicitly says no peers exist (no parent/sibling/child).
# Avoids the misleading "may be isolated" hint when discovery succeeded.
assert "no peers" in result.lower()
assert "No peers found." not in result # diagnostic prefix should NOT appear on the success branch
assert "may be isolated" not in result
async def test_auth_failure_surfaces_restart_hint(self):
    """401/403 → the auth failure and restart hint reach the user, never 'isolated'."""
    import a2a_tools

    diagnostic = "Authentication to platform failed (HTTP 401). Restart the workspace to re-mint."
    with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diagnostic)):
        result = await a2a_tools.tool_list_peers()
    for needle in ("401", "Authentication"):
        assert needle in result
    # The "isolated" message was the bug — make sure the regression doesn't return.
    assert "may be isolated" not in result
async def test_404_surfaces_registration_hint(self):
    """404 → the tool output tells the user that re-registration is required."""
    import a2a_tools

    diagnostic = "Workspace ID ws-test is not registered with the platform (HTTP 404). Re-register."
    with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diagnostic)):
        result = await a2a_tools.tool_list_peers()
    # Both the status code and the re-registration wording must surface.
    assert "404" in result
    assert "registered" in result.lower()
async def test_5xx_surfaces_platform_error(self):
    """5xx → 'Platform error' is surfaced so the agent/user can route to oncall."""
    import a2a_tools

    server_error = "Platform error: HTTP 503."
    with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], server_error)):
        result = await a2a_tools.tool_list_peers()
    assert "503" in result and "Platform error" in result
async def test_network_error_surfaces_unreachable(self):
    """Network error → operator sees the platform is unreachable, not 'no peers'."""
    import a2a_tools

    net_diag = "Cannot reach platform at http://platform.example: timed out"
    with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], net_diag)):
        result = await a2a_tools.tool_list_peers()
    for fragment in ("Cannot reach platform", "timed out"):
        assert fragment in result
async def test_peers_returned_formatted_lines(self):
"""Peers list is formatted as '- name (ID: ..., status: ..., role: ...)'."""
@ -550,7 +593,7 @@ class TestToolListPeers:
{"id": "ws-1", "name": "Alpha", "status": "online", "role": "worker"},
{"id": "ws-2", "name": "Beta", "status": "idle", "role": "analyst"},
]
with patch("a2a_tools.get_peers", return_value=peers):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
result = await a2a_tools.tool_list_peers()
assert "Alpha" in result
@ -567,7 +610,7 @@ class TestToolListPeers:
# Clear any prior cache entries for these IDs
a2a_tools._peer_names.pop("ws-cache-test", None)
peers = [{"id": "ws-cache-test", "name": "CacheMe", "status": "online", "role": "w"}]
with patch("a2a_tools.get_peers", return_value=peers):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
await a2a_tools.tool_list_peers()
assert a2a_tools._peer_names.get("ws-cache-test") == "CacheMe"
@ -577,7 +620,7 @@ class TestToolListPeers:
import a2a_tools
peers = [{"id": "ws-3", "name": "Gamma"}] # no status, no role
with patch("a2a_tools.get_peers", return_value=peers):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
result = await a2a_tools.tool_list_peers()
assert "Gamma" in result