forked from molecule-ai/molecule-core
Merge pull request #2404 from Molecule-AI/staging
staging → main: auto-promote 6159429
commit 0e3544d7b8
164  .github/workflows/e2e-staging-external.yml  vendored  Normal file
@@ -0,0 +1,164 @@
name: E2E Staging External Runtime

# Regression for the four/five workspaces.status=awaiting_agent transitions
# that silently failed in production for five days before migration 046
# extended the workspace_status enum (see
# workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql).
#
# Why this is its own workflow (not folded into e2e-staging-saas.yml):
# - The full-saas harness defaults to runtime=hermes, never exercises
#   external-runtime. Adding an `external` parameter to that script
#   would force every push to staging through both lifecycles in
#   series, doubling the EC2 cold-start budget.
# - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER
#   window, 90s default + sweep interval), which we wait through
#   deliberately. Folding it into hermes would make the long path
#   even longer.
# - It can run in parallel with the hermes E2E since both create
#   fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs
#   `e2e-...`).
#
# Triggers:
# - Push to staging when any source affecting external runtime,
#   hibernation, or the migration set changes.
# - PR review for the same set.
# - Manual workflow_dispatch.
# - Daily cron at 07:30 UTC (catches drift on quiet days; staggered
#   30 min after e2e-staging-saas.yml's 07:00 UTC cron).
#
# Concurrency: serialized so two staging pushes don't fight for the
# same EC2 quota window. cancel-in-progress=false so a half-rolled
# tenant always finishes its teardown.

on:
  push:
    branches: [staging, main]
    paths:
      - 'workspace-server/internal/handlers/workspace.go'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_restart.go'
      - 'workspace-server/internal/registry/healthsweep.go'
      - 'workspace-server/internal/registry/liveness.go'
      - 'workspace-server/migrations/**'
      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
      - 'tests/e2e/test_staging_external_runtime.sh'
      - '.github/workflows/e2e-staging-external.yml'
  pull_request:
    branches: [staging, main]
    paths:
      - 'workspace-server/internal/handlers/workspace.go'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace_restart.go'
      - 'workspace-server/internal/registry/healthsweep.go'
      - 'workspace-server/internal/registry/liveness.go'
      - 'workspace-server/migrations/**'
      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
      - 'tests/e2e/test_staging_external_runtime.sh'
      - '.github/workflows/e2e-staging-external.yml'
  workflow_dispatch:
    inputs:
      keep_org:
        description: "Skip teardown for debugging (only via manual dispatch)"
        required: false
        type: boolean
        default: false
      stale_wait_secs:
        description: "Seconds to wait for the heartbeat-staleness sweep (default 180 = 90s window + 90s buffer)"
        required: false
        default: "180"
  schedule:
    - cron: '30 7 * * *'

concurrency:
  group: e2e-staging-external
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  e2e-staging-external:
    name: E2E Staging External Runtime
    runs-on: ubuntu-latest
    timeout-minutes: 25

    env:
      MOLECULE_CP_URL: https://staging-api.moleculesai.app
      MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
      E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }}

    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

      - name: Verify admin token present
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            # Schedule + push triggers must hard-fail when the token is
            # missing — silent skip would mask infra rot. Manual dispatch
            # gets the same hard-fail; an operator running this on a fork
            # without secrets configured needs to know up-front.
            echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
            exit 2
          fi
          echo "Admin token present ✓"

      - name: CP staging health preflight
        run: |
          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
          if [ "$code" != "200" ]; then
            echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
            exit 1
          fi
          echo "Staging CP healthy ✓"

      - name: Run external-runtime E2E
        id: e2e
        run: bash tests/e2e/test_staging_external_runtime.sh

      # Mirror the e2e-staging-saas.yml safety net: if the runner is
      # cancelled (e.g. concurrent staging push), the test script's
      # EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to
      # *this* run id.
      - name: Teardown safety net (runs on cancel/failure)
        if: always()
        env:
          ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
        run: |
          set +e
          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
            | python3 -c "
          import json, sys, os, datetime
          run_id = os.environ.get('GITHUB_RUN_ID', '')
          d = json.load(sys.stdin)
          # Scope STRICTLY to this run id (e2e-ext-YYYYMMDD-<runid>-...)
          # so concurrent runs and unrelated dev probes are not touched.
          # Sweep today AND yesterday so a midnight-crossing run still
          # cleans up its own slug.
          today = datetime.date.today()
          yesterday = today - datetime.timedelta(days=1)
          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
          if not run_id:
              # Without a run id we cannot scope safely; bail rather
              # than risk deleting unrelated tenants.
              sys.exit(0)
          prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates)
          for o in d.get('orgs', []):
              s = o.get('slug', '')
              if s.startswith(prefixes) and o.get('status') != 'purged':
                  print(s)
          " 2>/dev/null)
          if [ -n "$orgs" ]; then
            echo "Safety-net sweep: deleting leftover orgs:"
            echo "$orgs"
            for slug in $orgs; do
              curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
                -H "Authorization: Bearer $ADMIN_TOKEN" \
                -H "Content-Type: application/json" \
                -d "{\"confirm\":\"$slug\"}" >/dev/null 2>&1
            done
          else
            echo "Safety-net sweep: no leftover orgs to clean."
          fi
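For a manual run, the workflow_dispatch inputs above can be set from the GitHub CLI. A minimal sketch, assuming `gh` is authenticated against Molecule-AI/molecule-core and run from a checkout of the staging branch; input names come from the workflow, everything else is illustrative:

```bash
# Dispatch the external-runtime E2E by hand, keeping the tenant for
# inspection and stretching the staleness wait beyond the 180s default.
gh workflow run e2e-staging-external.yml \
  --repo Molecule-AI/molecule-core \
  --ref staging \
  -f keep_org=true \
  -f stale_wait_secs=240

# List the run that was just queued.
gh run list --repo Molecule-AI/molecule-core \
  --workflow=e2e-staging-external.yml --limit 1
```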
167  .github/workflows/harness-replays.yml  vendored  Normal file
@@ -0,0 +1,167 @@
name: Harness Replays

# Boots tests/harness (production-shape compose topology with TenantGuard,
# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
# every replay under tests/harness/replays/. Fails the PR if any replay
# fails.
#
# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
# a public route in router.go but forgot to add it to TenantGuard's
# allowlist. The handler-level test in buildinfo_test.go constructed a
# minimal gin engine without TenantGuard — green. The harness's
# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
# inject X-Molecule-Org-Id, so the curl path is identical to production's
# redeploy verifier), but no one ran the harness pre-merge. The bug
# shipped; the redeploy verifier silently soft-warned every tenant as
# "unreachable" for ~1 day before being noticed.
#
# This gate makes "did you actually run the harness?" a CI invariant
# instead of a memory-discipline thing.
#
# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
# to staging+main, real work is gated per-step on detect-changes output.
# One job → one check run → branch-protection-clean (the SKIPPED-in-set
# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).

on:
  push:
    branches: [main, staging]
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'tests/harness/**'
      - '.github/workflows/harness-replays.yml'
  pull_request:
    branches: [main, staging]
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'tests/harness/**'
      - '.github/workflows/harness-replays.yml'
  workflow_dispatch:
  merge_group:
    types: [checks_requested]

concurrency:
  # Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
  # cancellation deadlock — see e2e-api.yml's concurrency block for
  # the 2026-04-28 incident that codified this pattern.
  group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false

jobs:
  detect-changes:
    runs-on: ubuntu-latest
    outputs:
      run: ${{ steps.decide.outputs.run }}
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
        id: filter
        with:
          filters: |
            run:
              - 'workspace-server/**'
              - 'canvas/**'
              - 'tests/harness/**'
              - '.github/workflows/harness-replays.yml'
      - id: decide
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "run=true" >> "$GITHUB_OUTPUT"
          else
            echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
          fi

  # ONE job that always runs. Real work is gated per-step on
  # detect-changes.outputs.run so an unrelated PR (e.g. doc-only
  # change to molecule-controlplane wired here later) emits the
  # required check without spending CI cycles. Single-job pattern
  # matches e2e-api.yml — see that workflow's comment for why a
  # job-level `if: false` would block branch protection via the
  # SKIPPED-in-set bug.
  harness-replays:
    needs: detect-changes
    name: Harness Replays
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.run != 'true'
        run: |
          echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
          echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."

      - if: needs.detect-changes.outputs.run == 'true'
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

      - name: Checkout sibling plugin repo
        # Dockerfile.tenant copies molecule-ai-plugin-github-app-auth/
        # at the build-context root (see workspace-server/Dockerfile.tenant
        # line 19). PLUGIN_REPO_PAT pattern matches publish-workspace-server-image.yml.
        if: needs.detect-changes.outputs.run == 'true'
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          repository: Molecule-AI/molecule-ai-plugin-github-app-auth
          path: molecule-ai-plugin-github-app-auth
          token: ${{ secrets.PLUGIN_REPO_PAT || secrets.GITHUB_TOKEN }}

      - name: Add /etc/hosts entry for harness-tenant.localhost
        # ubuntu-latest doesn't auto-resolve *.localhost the way macOS
        # sometimes does. seed.sh + replay scripts curl
        # http://harness-tenant.localhost:8080 — without the entry
        # they'd fail with getaddrinfo ENOTFOUND.
        if: needs.detect-changes.outputs.run == 'true'
        run: |
          echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts >/dev/null
          getent hosts harness-tenant.localhost

      - name: Install Python deps for replays
        # peer-discovery-404 (and future replays) eval Python against the
        # running tenant — importing workspace/a2a_client.py pulls in
        # httpx. tests/harness/requirements.txt holds just the HTTP-client
        # surface to keep CI install fast (~3s) vs the full
        # workspace/requirements.txt (~30s).
        if: needs.detect-changes.outputs.run == 'true'
        run: pip install -r tests/harness/requirements.txt

      - name: Run all replays against the harness
        # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
        # every replays/*.sh → tear down via down.sh on EXIT (trap).
        # Non-zero exit on any replay failure.
        #
        # KEEP_UP=1: without this, the script's trap-on-EXIT tears
        # down containers immediately on failure, leaving the dump
        # step below with nothing to dump (verified on PR #2410's
        # first run — tenant became unhealthy, trap fired, dump
        # step saw empty containers). Keeping them up lets the
        # failure path collect tenant/cp-stub/cf-proxy logs. The
        # always-run "Force teardown" step does the actual cleanup.
        if: needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        env:
          KEEP_UP: "1"
        run: ./run-all-replays.sh

      - name: Dump compose logs on failure
        if: failure() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        run: |
          echo "=== docker compose ps ==="
          docker compose -f compose.yml ps || true
          echo "=== tenant logs ==="
          docker compose -f compose.yml logs tenant || true
          echo "=== cp-stub logs ==="
          docker compose -f compose.yml logs cp-stub || true
          echo "=== cf-proxy logs ==="
          docker compose -f compose.yml logs cf-proxy || true
          echo "=== postgres logs (last 100) ==="
          docker compose -f compose.yml logs --tail 100 postgres || true

      - name: Force teardown
        # We pass KEEP_UP=1 to run-all-replays.sh so the dump step
        # above sees real containers — that means we own teardown
        # explicitly here. Always run.
        if: always() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        run: ./down.sh || true
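The same gate can be reproduced locally before pushing. A minimal sketch using only the scripts the workflow above invokes (KEEP_UP, run-all-replays.sh, compose.yml, and down.sh are from the harness; the log command on failure is illustrative):

```bash
# From the repo root: run every replay against the local harness,
# keep containers up on failure for inspection, then tear down.
cd tests/harness
KEEP_UP=1 ./run-all-replays.sh || docker compose -f compose.yml logs tenant
./down.sh
```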
134  .github/workflows/publish-runtime.yml  vendored
@@ -154,139 +154,15 @@ jobs:

      - name: Verify package contents (sanity)
        working-directory: ${{ runner.temp }}/runtime-build
        # Smoke logic lives in scripts/wheel_smoke.py so the same gate runs
        # at both PR-time (runtime-prbuild-compat.yml) and publish-time
        # (here). Splitting the smoke across two heredocs let them drift
        # apart historically — one script keeps them locked.
        run: |
          python -m twine check dist/*
          # Smoke-import the built wheel to catch import-rewrite mistakes
          # before they hit PyPI. Asserts on STABLE INVARIANTS only —
          # symbols + classes that are part of the package's public
          # contract (BaseAdapter interface, the canonical a2a sentinel,
          # core submodules). Don't add feature-flag-style assertions
          # here — they fire false-positive every time staging is mid-
          # release of that feature.
          python -m venv /tmp/smoke
          /tmp/smoke/bin/pip install --quiet dist/*.whl
          WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
          PLATFORM_URL=http://localhost:8080 \
          /tmp/smoke/bin/python -c "
          # Importing main is the strongest smoke test we can do here:
          # main.py is the entry point and pulls every other module
          # transitively. If the build script missed an import rewrite
          # (e.g. left a bare \`from transcript_auth import ...\` instead
          # of \`from molecule_runtime.transcript_auth import ...\` — the
          # 0.1.16 incident), this fails with ModuleNotFoundError instead
          # of shipping to PyPI and breaking every workspace startup.
          # Import the entry-point target by NAME — not just the module.
          # The wheel's pyproject.toml declares
          # `molecule-runtime = molecule_runtime.main:main_sync` so if
          # main_sync goes missing (it did in 0.1.16-0.1.18), every
          # workspace startup fails with `ImportError: cannot import name
          # 'main_sync'`. Plain `import molecule_runtime.main` doesn't
          # catch that because the module loads fine.
          from molecule_runtime.main import main_sync  # noqa: F401
          from molecule_runtime import a2a_client, a2a_tools
          from molecule_runtime.builtin_tools import memory
          from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
          # Stable invariants: package exports + BaseAdapter shape.
          assert a2a_client._A2A_ERROR_PREFIX, 'a2a_client missing error sentinel'
          assert callable(get_adapter), 'adapters.get_adapter must be callable'
          assert hasattr(BaseAdapter, 'name'), 'BaseAdapter interface broken'
          assert hasattr(AdapterConfig, '__init__'), 'AdapterConfig dataclass missing'

          # Call-shape smoke for AgentCard. Pure imports don't catch
          # field-shape regressions in upstream SDKs that only surface
          # at construction time. Two bugs of this exact class shipped
          # since the a2a-sdk 1.0 migration:
          #   - state_transition_history=True (fixed in #2179)
          #   - supported_protocols=[...] (the protobuf field is
          #     supported_interfaces — caused every workspace boot
          #     to crash with `ValueError: Protocol message AgentCard
          #     has no "supported_protocols" field`; fixed alongside
          #     this smoke)
          #
          # This block instantiates the EXACT classes main.py uses,
          # with the EXACT keyword arguments. If a future a2a-sdk
          # upgrade renames any of supported_interfaces / streaming /
          # push_notifications / etc., the publish fails here instead
          # of breaking every workspace startup. main.py and this
          # smoke MUST stay in lockstep — adding a kwarg to one
          # without mirroring it here is the regression vector.
          from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface
          AgentCard(
              name='smoke-agent',
              description='publish-runtime smoke test',
              version='0.0.0-smoke',
              supported_interfaces=[
                  AgentInterface(protocol_binding='https://a2a.g/v1', url='http://localhost:8080'),
              ],
              capabilities=AgentCapabilities(
                  streaming=True,
                  push_notifications=False,
              ),
              skills=[
                  AgentSkill(
                      id='smoke-skill',
                      name='Smoke',
                      description='no-op',
                      tags=['smoke'],
                      examples=['noop'],
                  ),
              ],
              default_input_modes=['text/plain', 'application/json'],
              default_output_modes=['text/plain', 'application/json'],
          )
          print('✓ AgentCard call-shape smoke passed')

          # Well-known agent-card path probe alignment. main.py's
          # _send_initial_prompt() polls AGENT_CARD_WELL_KNOWN_PATH
          # to know when the local A2A server is ready. If the SDK
          # ever splits the constant value from the path that
          # create_agent_card_routes() actually mounts at, every
          # workspace silently drops its initial_prompt:
          #   - Probe gets 404 every attempt.
          #   - Falls through to 'server not ready after 30s,
          #     skipping' even though the server is fine.
          #   - The user hits a fresh chat with no kickoff context.
          # This was the #2193 incident class — the v0.x → v1.x
          # rename of /.well-known/agent.json → /.well-known/agent-card.json
          # plus the constant itself moving to a2a.utils.constants.
          # source-tree pytest (test_agent_card_well_known_path.py)
          # catches main.py-side regressions; this catches the
          # SDK-side ones BEFORE PyPI upload.
          from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
          from a2a.server.routes import create_agent_card_routes
          mounted_paths = [
              getattr(r, 'path', None)
              for r in create_agent_card_routes(
                  AgentCard(
                      name='wk-smoke',
                      description='well-known mount alignment',
                      version='0.0.0-smoke',
                  )
              )
          ]
          assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, (
              f'AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) '
              f'is NOT among paths mounted by create_agent_card_routes '
              f'({mounted_paths!r}). The SDK constant and its own route '
              f'factory have drifted — workspace probes will 404 forever, '
              f'silently dropping every workspace initial_prompt.'
          )
          print(f'✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})')

          # Message helper smoke. a2a-sdk renamed
          # new_agent_text_message → new_text_message in the v1.x
          # protobuf-flat migration (per the v0→v1 cheat sheet). main.py
          # and a2a_executor.py call new_text_message in hot paths; if
          # the import breaks, every reply errors with ImportError before
          # the message even leaves the workspace. Importing here
          # catches a future v2.x rename at publish time.
          from a2a.helpers import new_text_message
          msg = new_text_message('smoke')
          assert msg is not None, 'new_text_message returned None'
          print('✓ message helper import + call OK')

          print('✓ smoke import passed')
          "
          /tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"

      - name: Publish to PyPI (Trusted Publisher / OIDC)
        # PyPI side is configured: project molecule-ai-workspace-runtime →
11  .github/workflows/redeploy-tenants-on-main.yml  vendored
@@ -306,6 +306,17 @@ jobs:
          if [ $UNREACHABLE_COUNT -gt 0 ]; then
            echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
          fi

          # Belt-and-suspenders sanity floor: same logic as the staging
          # variant — see that file's comment for the full rationale.
          # Floor only applies when fleet >= 4; below that, canary-verify
          # is the actual gate.
          TOTAL_VERIFIED=${#SLUGS[@]}
          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
            exit 1
          fi

          if [ $STALE_COUNT -gt 0 ]; then
            echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
            exit 1
@@ -283,6 +283,25 @@ jobs:
          if [ $UNREACHABLE_COUNT -gt 0 ]; then
            echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
          fi

          # Belt-and-suspenders sanity floor: if MORE than half the fleet is
          # unreachable AND the fleet is large enough that "half down" is
          # statistically meaningful, this is a real outage (e.g. new image
          # crashes on startup), not a teardown race. Hard-fail.
          #
          # Floor only applies when TOTAL_VERIFIED >= 4 — below that, the
          # canary-verify step is the actual gate for "all tenants down"
          # detection (it runs against the canary first and aborts the
          # rollout if the canary fails to come up). Without the >=4 gate,
          # a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a
          # quiet staging push) would re-flake on the exact teardown-race
          # condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail.
          TOTAL_VERIFIED=${#SLUGS[@]}
          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
            exit 1
          fi

          if [ $STALE_COUNT -gt 0 ]; then
            echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
            exit 1
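To see why the >=4 guard matters, here is the floor arithmetic worked through with the same variable names as the steps above; the fleet sizes are illustrative values only:

```bash
# Fleet of 5 with 3 unreachable: 3 > 5/2 (integer division gives 2), so the floor trips.
TOTAL_VERIFIED=5; UNREACHABLE_COUNT=3
[ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ] && echo "hard-fail"

# Fleet of 1 with 1 unreachable: the >=4 guard short-circuits first, so the
# teardown-race case that #2402 fixed stays a warning instead of a failure.
TOTAL_VERIFIED=1; UNREACHABLE_COUNT=1
[ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ] && echo "hard-fail"
```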
10  .github/workflows/runtime-prbuild-compat.yml  vendored
@@ -34,12 +34,14 @@ on:
      # changes (it controls the wheel layout).
      - 'workspace/**'
      - 'scripts/build_runtime_package.py'
      - 'scripts/wheel_smoke.py'
      - '.github/workflows/runtime-prbuild-compat.yml'
  pull_request:
    branches: [main, staging]
    paths:
      - 'workspace/**'
      - 'scripts/build_runtime_package.py'
      - 'scripts/wheel_smoke.py'
      - '.github/workflows/runtime-prbuild-compat.yml'
  workflow_dispatch:
    # Required-check support: when this becomes a branch-protection gate,
@@ -94,7 +96,9 @@ jobs:
          /tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
            | grep -E '^(Name|Version):'
      - name: Smoke import the PR-built wheel
        env:
          WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
        # Same script publish-runtime.yml runs against the to-be-PyPI wheel.
        # Closes the PR-time vs publish-time gap: a PR adding a new SDK
        # call-shape no longer passes here (narrow `import main_sync`) only
        # to fail post-merge in publish-runtime's broader smoke.
        run: |
          /tmp/venv-built/bin/python -c "from molecule_runtime.main import main_sync; print('PR-built runtime imports OK')"
          /tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
48  canvas/src/app/api/buildinfo/__tests__/route.test.ts  Normal file
@@ -0,0 +1,48 @@
/**
 * Canvas /api/buildinfo — version-display endpoint mirroring
 * workspace-server's /buildinfo. Lets `curl <url>/api/buildinfo`
 * confirm which git SHA is live on a canvas deployment.
 */
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import { GET } from "../route";

const ENV_KEYS = ["VERCEL_GIT_COMMIT_SHA", "VERCEL_GIT_COMMIT_REF", "VERCEL_ENV"];

describe("GET /api/buildinfo", () => {
  let saved: Record<string, string | undefined>;

  beforeEach(() => {
    saved = Object.fromEntries(ENV_KEYS.map((k) => [k, process.env[k]]));
    for (const k of ENV_KEYS) delete process.env[k];
  });

  afterEach(() => {
    for (const k of ENV_KEYS) {
      if (saved[k] === undefined) delete process.env[k];
      else process.env[k] = saved[k];
    }
  });

  it("returns dev sentinel when Vercel env vars are unset", async () => {
    const res = await GET();
    const body = await res.json();
    expect(body).toEqual({ git_sha: "dev", git_ref: "", vercel_env: "local" });
  });

  it("reports the SHA Vercel injected at build time", async () => {
    process.env.VERCEL_GIT_COMMIT_SHA = "abc1234567890";
    process.env.VERCEL_GIT_COMMIT_REF = "main";
    process.env.VERCEL_ENV = "production";
    const res = await GET();
    const body = await res.json();
    expect(body.git_sha).toBe("abc1234567890");
    expect(body.git_ref).toBe("main");
    expect(body.vercel_env).toBe("production");
  });

  it("returns 200 status and JSON content type", async () => {
    const res = await GET();
    expect(res.status).toBe(200);
    expect(res.headers.get("content-type")).toContain("application/json");
  });
});
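The suite imports from vitest, so it can be run on its own while iterating on the route. A minimal sketch, assuming vitest is the canvas package's configured runner and the command is issued from the canvas package root; the path matches the file above:

```bash
# Run only the buildinfo route tests.
npx vitest run src/app/api/buildinfo/__tests__/route.test.ts
```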
18  canvas/src/app/api/buildinfo/route.ts  Normal file
@@ -0,0 +1,18 @@
import { NextResponse } from "next/server";

// Mirror of workspace-server's GET /buildinfo (PR #2398). Lets a developer
// confirm which git SHA is live on a canvas deployment with the same
// `curl <url>/buildinfo` flow they use against tenant workspaces.
//
// Vercel injects VERCEL_GIT_COMMIT_SHA / _REF / VERCEL_ENV at build time
// from the deploying commit; outside Vercel (local `next dev`, harness)
// these are unset and the endpoint reports `git_sha: "dev"`. Same sentinel
// the workspace-server uses pre-ldflags-injection so both surfaces speak
// the same vocabulary.
export async function GET() {
  return NextResponse.json({
    git_sha: process.env.VERCEL_GIT_COMMIT_SHA ?? "dev",
    git_ref: process.env.VERCEL_GIT_COMMIT_REF ?? "",
    vercel_env: process.env.VERCEL_ENV ?? "local",
  });
}
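In practice the endpoint is checked with the same curl flow used against tenant workspaces. A minimal sketch; the field names come from the route above, while the URL and the response values are illustrative:

```bash
# Spot-check which commit a canvas deployment is serving.
curl -sS https://canvas.moleculesai.app/api/buildinfo | jq .
# Expected shape: {"git_sha":"<deployed sha>","git_ref":"main","vercel_env":"production"}
# Local `next dev` (no Vercel env injected) returns the "dev" / "local" sentinels instead.
```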
112  scripts/ops/check-prod-versions.sh  Executable file
@@ -0,0 +1,112 @@
#!/usr/bin/env bash
# Check whether production tenants and canvas are running latest main.
#
# Usage:
#   ./scripts/ops/check-prod-versions.sh                # production
#   ENV=staging ./scripts/ops/check-prod-versions.sh    # staging tenants
#
# Outputs a table of {surface, current_sha, expected_sha, status}. Returns
# non-zero if any surface is stale so this can be wired into a periodic
# alert.
#
# Why this exists: every time someone hits a "is the fix live?" question,
# they have to remember the curl pattern + cross-reference with
# `git rev-parse origin/main`. This script does that check uniformly across
# every public surface (workspace tenants + canvas) and gives a one-line
# verdict instead of a stack of one-off curls.

set -euo pipefail

ENV="${ENV:-production}"
EXPECTED_REF="${EXPECTED_REF:-main}"

case "$ENV" in
  production)
    TENANT_DOMAIN="moleculesai.app"
    CANVAS_URL="https://canvas.moleculesai.app"
    # Default canary tenants for production. Override via TENANT_SLUGS=
    # to cover a custom set.
    DEFAULT_TENANTS="hongmingwang reno-stars"
    ;;
  staging)
    TENANT_DOMAIN="staging.moleculesai.app"
    CANVAS_URL="https://canvas-staging.moleculesai.app"
    DEFAULT_TENANTS=""  # staging tenants are ephemeral; user must specify
    ;;
  *)
    echo "Unknown ENV=$ENV (expected: production | staging)" >&2
    exit 2
    ;;
esac

TENANT_SLUGS="${TENANT_SLUGS:-$DEFAULT_TENANTS}"

# Pull EXPECTED_SHA from GitHub. Falls back to local git if gh isn't
# logged in — local main may lag origin but is usually close enough for
# debugging, and we still report the comparison clearly.
EXPECTED_SHA=""
if command -v gh >/dev/null 2>&1; then
  EXPECTED_SHA=$(gh api "repos/Molecule-AI/molecule-core/commits/${EXPECTED_REF}" --jq '.sha' 2>/dev/null || true)
fi
if [ -z "$EXPECTED_SHA" ]; then
  if git rev-parse "origin/${EXPECTED_REF}" >/dev/null 2>&1; then
    EXPECTED_SHA=$(git rev-parse "origin/${EXPECTED_REF}")
    echo "[check-prod-versions] WARN: gh unavailable, using local origin/${EXPECTED_REF}=${EXPECTED_SHA:0:7} (may lag)"
  else
    echo "[check-prod-versions] ERROR: cannot resolve expected SHA — gh not logged in and origin/${EXPECTED_REF} not fetched" >&2
    exit 2
  fi
fi
EXPECTED_SHORT="${EXPECTED_SHA:0:7}"

echo "Checking ${ENV} surfaces against ${EXPECTED_REF}=${EXPECTED_SHORT}"
echo ""
printf "%-25s %-9s %-9s %s\n" "Surface" "Live" "Expected" "Status"
printf "%-25s %-9s %-9s %s\n" "-------" "----" "--------" "------"

STALE_COUNT=0
UNREACHABLE_COUNT=0

# Tenant surfaces — workspace-server /buildinfo (added in PR #2398).
for slug in $TENANT_SLUGS; do
  URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
  BODY=$(curl -sS --max-time 15 "$URL" 2>/dev/null || echo "")
  ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
  if [ -z "$ACTUAL_SHA" ]; then
    printf "%-25s %-9s %-9s ⚠ unreachable\n" "tenant: $slug" "—" "$EXPECTED_SHORT"
    UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
  elif [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
    printf "%-25s %-9s %-9s ✓ current\n" "tenant: $slug" "${ACTUAL_SHA:0:7}" "$EXPECTED_SHORT"
  else
    printf "%-25s %-9s %-9s ✗ stale\n" "tenant: $slug" "${ACTUAL_SHA:0:7}" "$EXPECTED_SHORT"
    STALE_COUNT=$((STALE_COUNT + 1))
  fi
done

# Canvas — Next.js /api/buildinfo (PR #2407). Vercel injects
# VERCEL_GIT_COMMIT_SHA at build time so this reflects the deployed
# commit, not the request time.
CANVAS_BODY=$(curl -sS --max-time 15 "${CANVAS_URL}/api/buildinfo" 2>/dev/null || echo "")
CANVAS_SHA=$(echo "$CANVAS_BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
if [ -z "$CANVAS_SHA" ]; then
  printf "%-25s %-9s %-9s ⚠ unreachable (route may not be deployed yet)\n" "canvas" "—" "$EXPECTED_SHORT"
  UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
elif [ "$CANVAS_SHA" = "dev" ]; then
  printf "%-25s %-9s %-9s ⚠ dev sentinel (Vercel env not injected — check VERCEL_GIT_COMMIT_SHA)\n" "canvas" "dev" "$EXPECTED_SHORT"
  UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
elif [ "$CANVAS_SHA" = "$EXPECTED_SHA" ]; then
  printf "%-25s %-9s %-9s ✓ current\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT"
else
  printf "%-25s %-9s %-9s ✗ stale\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT"
  STALE_COUNT=$((STALE_COUNT + 1))
fi

echo ""
if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
  echo "All surfaces current."
  exit 0
fi
echo "Summary: ${STALE_COUNT} stale, ${UNREACHABLE_COUNT} unreachable."
# Stale is a deploy gap; unreachable is operational (DNS, CF, route absent).
# Both are signal — exit non-zero so cron / CI can alert.
exit 1
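Beyond the two usage lines in the header, the override knobs compose. A minimal sketch of checking a single ephemeral staging tenant; the variable names come from the script above, while the slug value is a placeholder:

```bash
# Check one specific staging tenant instead of the (empty) staging default set.
ENV=staging TENANT_SLUGS="e2e-ext-20260430-12345" EXPECTED_REF=main \
  ./scripts/ops/check-prod-versions.sh
echo "exit=$?"   # non-zero means at least one surface is stale or unreachable
```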
145  scripts/wheel_smoke.py  Normal file
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""Smoke-test an installed molecule-ai-workspace-runtime wheel.

Runs the same invariant assertions in two workflows:
  * publish-runtime.yml — after building dist/*.whl, before PyPI upload
  * runtime-prbuild-compat.yml — after building the PR's wheel, before merge

Splitting the smoke across two inline heredocs let PR-time and publish-time
drift apart. After 2026-04 we kept hitting publish-time failures for
regressions a PR-time check could have caught. One script, both gates.

Failure here intentionally exits non-zero so the workflow's `run:` step fails.
Each block prints a single ✓ line on success so the GH summary log stays
readable; assertion errors propagate with their own message.

Run directly: `python scripts/wheel_smoke.py` after `pip install <wheel>`.
"""

import os
import sys


def smoke_imports_and_invariants() -> None:
    """Module imports + stable contract assertions.

    Importing main_sync by name is the strongest pre-PyPI gate we have for
    import-rewrite mistakes (the 0.1.16 incident, where main.py loaded but
    main_sync was missing because the build script dropped a re-export).
    """
    from molecule_runtime.main import main_sync  # noqa: F401
    from molecule_runtime import a2a_client, a2a_tools  # noqa: F401
    from molecule_runtime.builtin_tools import memory  # noqa: F401
    from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig

    assert a2a_client._A2A_ERROR_PREFIX, "a2a_client missing error sentinel"
    assert callable(get_adapter), "adapters.get_adapter must be callable"
    assert hasattr(BaseAdapter, "name"), "BaseAdapter interface broken"
    assert hasattr(AdapterConfig, "__init__"), "AdapterConfig dataclass missing"
    print("✓ module imports + invariants OK")


def smoke_agent_card_call_shape() -> None:
    """Construct AgentCard with the EXACT kwargs main.py uses.

    Pure imports don't catch field-shape regressions in upstream SDKs that
    only surface at construction time. Two bugs of this exact class shipped
    since the a2a-sdk 1.0 migration:
      - state_transition_history=True (#2179)
      - supported_protocols=[...] (the protobuf field is supported_interfaces;
        every workspace boot crashed with `ValueError: Protocol message
        AgentCard has no "supported_protocols" field`)

    main.py and this block MUST stay in lockstep — adding a kwarg there
    without mirroring it here is the regression vector.
    """
    from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface

    AgentCard(
        name="smoke-agent",
        description="wheel-smoke: AgentCard call-shape",
        version="0.0.0-smoke",
        supported_interfaces=[
            AgentInterface(protocol_binding="https://a2a.g/v1", url="http://localhost:8080"),
        ],
        capabilities=AgentCapabilities(
            streaming=True,
            push_notifications=False,
        ),
        skills=[
            AgentSkill(
                id="smoke-skill",
                name="Smoke",
                description="no-op",
                tags=["smoke"],
                examples=["noop"],
            ),
        ],
        default_input_modes=["text/plain", "application/json"],
        default_output_modes=["text/plain", "application/json"],
    )
    print("✓ AgentCard call-shape smoke passed")


def smoke_well_known_path_alignment() -> None:
    """The SDK's published constant must match the path it actually mounts.

    main.py polls AGENT_CARD_WELL_KNOWN_PATH to detect server readiness. If
    the constant and create_agent_card_routes() drift, every workspace's
    initial_prompt silently drops (probe 404s, falls through to "skipping").
    This was the #2193 incident class.
    """
    from a2a.types import AgentCard
    from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
    from a2a.server.routes import create_agent_card_routes

    mounted_paths = [
        getattr(r, "path", None)
        for r in create_agent_card_routes(
            AgentCard(
                name="wk-smoke",
                description="well-known mount alignment",
                version="0.0.0-smoke",
            )
        )
    ]
    assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, (
        f"AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) is NOT among "
        f"paths mounted by create_agent_card_routes ({mounted_paths!r}). The SDK "
        "constant and its own route factory have drifted — workspace probes will "
        "404 forever, silently dropping every workspace initial_prompt."
    )
    print(f"✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})")


def smoke_message_helper() -> None:
    """new_text_message is the v1.x rename of new_agent_text_message.

    main.py and a2a_executor.py call new_text_message in hot paths; if the
    import breaks, every reply errors with ImportError before the message
    even leaves the workspace. Importing here catches a future v2.x rename
    at publish time.
    """
    from a2a.helpers import new_text_message

    msg = new_text_message("smoke")
    assert msg is not None, "new_text_message returned None"
    print("✓ message helper import + call OK")


def main() -> int:
    # main.py validates WORKSPACE_ID at module-import time via platform_auth.
    # Set placeholders so the smoke doesn't trip on the env-var guard.
    os.environ.setdefault("WORKSPACE_ID", "00000000-0000-0000-0000-000000000000")
    os.environ.setdefault("PLATFORM_URL", "http://localhost:8080")

    smoke_imports_and_invariants()
    smoke_agent_card_call_shape()
    smoke_well_known_path_alignment()
    smoke_message_helper()
    print("✓ wheel smoke passed")
    return 0


if __name__ == "__main__":
    sys.exit(main())
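The docstring's "run directly" note can be expanded into the same local loop publish-runtime.yml runs in CI. A minimal sketch, assuming scripts/build_runtime_package.py produces dist/*.whl when run without arguments (an assumption; the workflows reference the script but its CLI is not shown here) and that the venv path is arbitrary:

```bash
# Build the wheel, install it into a scratch venv, then run the shared smoke.
python scripts/build_runtime_package.py          # assumed to emit dist/*.whl
python -m venv /tmp/smoke
/tmp/smoke/bin/pip install --quiet dist/*.whl
/tmp/smoke/bin/python scripts/wheel_smoke.py
```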
348  tests/e2e/test_staging_external_runtime.sh  Executable file
@ -0,0 +1,348 @@
|
||||
#!/bin/bash
|
||||
# test_staging_external_runtime.sh — E2E regression for the
|
||||
# external-runtime workspace lifecycle on a real staging tenant.
|
||||
#
|
||||
# Why this test exists: the four/five sites that write 'awaiting_agent'
|
||||
# / 'hibernating' to workspaces.status had been silently failing in
|
||||
# production for five days (see migration 046) before a static drift
|
||||
# gate caught the enum gap. Unit tests passed because sqlmock matched
|
||||
# the SQL by regex but didn't enforce the live enum constraint, and
|
||||
# every existing E2E exercised hermes (not external) so the silent
|
||||
# failures never surfaced. This test pins the four awaiting_agent
|
||||
# transitions in real Postgres on a real staging tenant.
|
||||
#
|
||||
# Verification path:
|
||||
# 1. Provision a fresh tenant (test_staging_full_saas.sh harness shape).
|
||||
# 2. Create an external-runtime workspace with NO URL → assert
|
||||
# response status == 'awaiting_agent' AND GET on the workspace
|
||||
# returns the same. (Pre-fix the row stuck on 'provisioning'
|
||||
# because the UPDATE in workspace.go:333 silently failed.)
|
||||
# 3. Register a fake URL via /registry/register → assert transition
|
||||
# to 'online'. (Pre-fix this branch worked because it writes
|
||||
# 'online' which IS in the enum.)
|
||||
# 4. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER (90s
|
||||
# default) + a sweep interval → assert transition back to
|
||||
# 'awaiting_agent'. (Pre-fix the sweep UPDATE failed silently and
|
||||
# the workspace stuck on 'online' indefinitely.)
|
||||
#
|
||||
# Hibernation is intentionally NOT covered here — it has its own timing
|
||||
# model (idle threshold) and warrants a separate harness.
|
||||
#
|
||||
# Required env (mirrors test_staging_full_saas.sh):
|
||||
# MOLECULE_CP_URL default: https://staging-api.moleculesai.app
|
||||
# MOLECULE_ADMIN_TOKEN CP admin bearer (Railway CP_ADMIN_API_TOKEN)
|
||||
#
|
||||
# Optional env:
|
||||
# E2E_PROVISION_TIMEOUT_SECS default 900 (15 min cold EC2 budget)
|
||||
# E2E_KEEP_ORG 1 → skip teardown (debugging only)
|
||||
# E2E_RUN_ID Slug suffix; CI: ${GITHUB_RUN_ID}
|
||||
# E2E_STALE_WAIT_SECS default 180 (90s window + 90s buffer)
|
||||
# E2E_INTENTIONAL_FAILURE 1 → break a step on purpose to verify
|
||||
# the EXIT trap still tears down (mirrors
|
||||
# the full-saas harness's safety net).
|
||||
#
|
||||
# Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
|
||||
# 4 teardown leak.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
|
||||
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
|
||||
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
|
||||
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
|
||||
STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
|
||||
|
||||
SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
|
||||
SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
|
||||
|
||||
log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
|
||||
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
|
||||
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
|
||||
# ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
|
||||
CLEANUP_DONE=0
|
||||
cleanup_org() {
|
||||
local entry_rc=$?
|
||||
if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi
|
||||
CLEANUP_DONE=1
|
||||
|
||||
if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
|
||||
log "E2E_KEEP_ORG=1 → leaving $SLUG behind for inspection"
|
||||
return 0
|
||||
fi
|
||||
|
||||
log "Cleanup: deleting tenant $SLUG..."
|
||||
curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \
|
||||
&& ok "Teardown request accepted" \
|
||||
|| log "Teardown returned non-2xx (may already be gone)"
|
||||
|
||||
local leak_count=1 elapsed=0
|
||||
while [ "$elapsed" -lt 60 ]; do
|
||||
leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \
|
||||
2>/dev/null || echo 1)
|
||||
[ "$leak_count" = "0" ] && break
|
||||
sleep 5
|
||||
elapsed=$((elapsed + 5))
|
||||
done
|
||||
|
||||
if [ "$leak_count" != "0" ]; then
|
||||
echo "⚠️ LEAK: org $SLUG still present post-teardown (count=$leak_count)" >&2
|
||||
exit 4
|
||||
fi
|
||||
ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
|
||||
|
||||
case "$entry_rc" in
|
||||
0|1|2|3|4) ;;
|
||||
*) exit 1 ;;
|
||||
esac
|
||||
}
|
||||
trap cleanup_org EXIT INT TERM
|
||||
|
||||
# ─── 0. Preflight ───────────────────────────────────────────────────────
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
log " Staging external-runtime E2E (regression for migration 046)"
|
||||
log " CP: $CP_URL"
|
||||
log " Slug: $SLUG"
|
||||
log " Stale: ${STALE_WAIT_SECS}s wait window"
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
|
||||
curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed"
|
||||
ok "CP reachable"
|
||||
|
||||
admin_call() {
|
||||
local method="$1"; shift; local path="$1"; shift
|
||||
curl "${CURL_COMMON[@]}" -X "$method" "$CP_URL$path" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" "$@"
|
||||
}
|
||||
|
||||
# ─── 1. Create org ──────────────────────────────────────────────────────
|
||||
log "1/8 Creating org $SLUG..."
|
||||
CREATE_RESP=$(admin_call POST /cp/admin/orgs \
|
||||
-d "{\"slug\":\"$SLUG\",\"name\":\"E2E ext $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
|
||||
ORG_ID=$(echo "$CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
|
||||
[ -z "$ORG_ID" ] && fail "Org create response missing 'id'"
|
||||
ok "Org created (id=$ORG_ID)"
|
||||
|
||||
# ─── 2. Wait for tenant provisioning ────────────────────────────────────
|
||||
# Terminal status from /cp/admin/orgs is 'running' (org_instances.status),
|
||||
# NOT 'ready' — same field the full-saas harness polls. 'failed' surfaces
|
||||
# diagnostic dump and aborts. See test_staging_full_saas.sh step 2 for
|
||||
# the field-bugfix history (2026-04-21, last_error path).
|
||||
log "2/8 Waiting for tenant (up to ${PROVISION_TIMEOUT_SECS}s)..."
|
||||
DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
|
||||
LAST_STATUS=""
|
||||
while true; do
|
||||
if [ "$(date +%s)" -gt "$DEADLINE" ]; then
|
||||
fail "Tenant provisioning timed out (last: $LAST_STATUS)"
|
||||
fi
|
||||
LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}')
|
||||
STATUS=$(echo "$LIST_JSON" | python3 -c "
|
||||
import json, sys
|
||||
d = json.load(sys.stdin)
|
||||
for o in d.get('orgs', []):
|
||||
if o.get('slug') == '$SLUG':
|
||||
print(o.get('instance_status', ''))
|
||||
sys.exit(0)
|
||||
print('')
|
||||
" 2>/dev/null || echo "")
|
||||
if [ "$STATUS" != "$LAST_STATUS" ]; then
|
||||
log " instance_status: $STATUS"
|
||||
LAST_STATUS="$STATUS"
|
||||
fi
|
||||
case "$STATUS" in
|
||||
running) break ;;
|
||||
failed)
|
||||
log "── DIAGNOSTIC BURST (step 2 — tenant provisioning failed) ──"
|
||||
echo "$LIST_JSON" | python3 -c "
|
||||
import json, sys
|
||||
d = json.load(sys.stdin)
|
||||
for o in d.get('orgs', []):
|
||||
if o.get('slug') == '$SLUG':
|
||||
print(json.dumps(o, indent=2))
|
||||
sys.exit(0)
|
||||
print('(no org row found for slug=$SLUG — DB drift?)')
|
||||
" 2>&1 | sed 's/^/ /'
|
||||
log "── END DIAGNOSTIC ──"
|
||||
fail "Tenant provisioning failed for $SLUG (see diagnostic above)"
|
||||
;;
|
||||
*) sleep 15 ;;
|
||||
esac
|
||||
done
|
||||
ok "Tenant provisioning complete"
|
||||
|
||||
# Derive tenant URL the same way the full-saas harness does.
|
||||
CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##')
|
||||
case "$CP_HOST" in
|
||||
api.*) DERIVED_DOMAIN="${CP_HOST#api.}" ;;
|
||||
staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
|
||||
*) DERIVED_DOMAIN="$CP_HOST" ;;
|
||||
esac
|
||||
TENANT_DOMAIN="${MOLECULE_TENANT_DOMAIN:-$DERIVED_DOMAIN}"
|
||||
TENANT_URL="https://$SLUG.$TENANT_DOMAIN"
|
||||
log " TENANT_URL=$TENANT_URL"
|
||||
|
||||
# ─── 3. Per-tenant admin token + TLS readiness ──────────────────────────
|
||||
log "3/8 Fetching per-tenant admin token..."
|
||||
TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
|
||||
TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))")
|
||||
[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token"
|
||||
ok "Token retrieved (len=${#TENANT_TOKEN})"
|
||||
|
||||
log "Waiting for tenant TLS / DNS..."
|
||||
TLS_DEADLINE=$(( $(date +%s) + 15 * 60 ))
|
||||
while true; do
|
||||
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then break; fi
|
||||
if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
|
||||
fail "Tenant URL never responded 2xx on /health within 15min"
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
ok "Tenant reachable"
|
||||
|
||||
tenant_call() {
|
||||
local method="$1"; shift; local path="$1"; shift
|
||||
curl "${CURL_COMMON[@]}" -X "$method" "$TENANT_URL$path" \
|
||||
-H "Authorization: Bearer $TENANT_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
"$@"
|
||||
}
|
||||
|
||||
# ─── 4. Create external workspace (no URL) ──────────────────────────────
|
||||
# This is the FIRST silent-failure path (workspace.go:333). Pre-migration
|
||||
# 046, the response would say status=awaiting_agent but the row stuck
|
||||
# on whatever the create handler set first (typically 'provisioning')
|
||||
# because the follow-up UPDATE failed the enum cast.
|
||||
log "4/8 Creating external workspace (no URL — exercises workspace.go:333)..."
|
||||
WS_CREATE_RESP=$(tenant_call POST /workspaces \
|
||||
-d '{"name":"ext-e2e","runtime":"external","external":true}')
|
||||
|
||||
WS_ID=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
|
||||
WS_RESP_STATUS=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
WS_AUTH_TOKEN=$(echo "$WS_CREATE_RESP" | python3 -c "
|
||||
import json,sys
|
||||
try:
|
||||
d = json.load(sys.stdin)
|
||||
conn = d.get('connection') or {}
|
||||
print(conn.get('auth_token','') or d.get('auth_token',''))
|
||||
except Exception:
|
||||
print('')
|
||||
")
|
||||
[ -z "$WS_ID" ] && fail "Workspace create missing id: $WS_CREATE_RESP"
|
||||
[ "$WS_RESP_STATUS" != "awaiting_agent" ] && fail "Expected response status=awaiting_agent, got $WS_RESP_STATUS"
|
||||
ok "Workspace created (id=$WS_ID, response status=awaiting_agent)"
|
||||
|
||||
# This GET is the proof that the row actually has the value (not just
|
||||
# the response body lying). Pre-migration-046 the UPDATE would have
|
||||
# silently failed and this would return whatever 'provisioning' the
|
||||
# initial INSERT left. Post-fix it must be 'awaiting_agent'.
|
||||
log " Verifying DB row..."
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
|
||||
ok "DB row stored as awaiting_agent (proof migration 046 applied)"
|
||||
|
||||
# ─── 5. Register the workspace (transitions to online) ──────────────────
|
||||
# Pre-fix this path was actually fine because it writes 'online', a value
|
||||
# already in the enum. We exercise it anyway because the registration
|
||||
# implicitly walks resolveDeliveryMode (registry.go:resolveDeliveryMode),
|
||||
# which DOES read runtime + apply the new poll-default introduced by
|
||||
# PR #2382.
|
||||
log "5/8 Registering workspace via /registry/register..."
|
||||
[ -z "$WS_AUTH_TOKEN" ] && fail "No workspace auth token returned — register impossible"
|
||||
# Payload contract (workspace-server/internal/models/workspace.go RegisterPayload):
|
||||
# id — required, the workspace UUID (NOT "workspace_id" — that's the
|
||||
# heartbeat payload field; mixing them yields a 400 from
|
||||
# ShouldBindJSON because `id` has binding:"required").
|
||||
# agent_card — required (binding:"required"); minimal valid card is name+skills.
|
||||
# delivery_mode — set explicitly to "poll" so url validation is skipped
|
||||
# regardless of whether the deployed image has the
|
||||
# runtime=external→poll default from PR #2382. Observed
|
||||
# 2026-04-30 17:18Z: a freshly-provisioned staging tenant
|
||||
# was running an older workspace-server :latest image
|
||||
# that lacked resolveDeliveryMode's external→poll branch,
|
||||
# so the implicit default was push and validateAgentURL
|
||||
# 400'd on example.invalid. Asserting on the implicit
|
||||
# default makes the *register call* itself fragile to
|
||||
# image-tag drift on the fleet — verify the default
|
||||
# separately (step 5b assertion) without depending on it
|
||||
# here.
|
||||
# url — accepted but not dispatched-to in poll mode, so
|
||||
# example.invalid is a valid sentinel.
|
||||
REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
|
||||
# Disable --fail-with-body for this one call so a 4xx surfaces the response
|
||||
# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
|
||||
REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
|
||||
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$REGISTER_BODY") || true
|
||||
log " register response: $(echo "$REGISTER_RESP" | head -c 300)"
|
||||
echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
|
||||
ok "Workspace transitioned to online"
|
||||
|
||||
# Confirm the register handler echoed back delivery_mode=poll. We read
|
||||
# this from the register RESPONSE, not the workspace GET response, because
|
||||
# the GET handler's SELECT (workspace.go:597) doesn't fetch delivery_mode
|
||||
# — its column list pre-dates the delivery_mode column from #2339 PR 1.
|
||||
# Surfacing delivery_mode in GET is tracked separately; not gating on it
|
||||
# here keeps this test focused on the awaiting_agent transitions.
|
||||
REGISTER_BODY_JSON=$(echo "$REGISTER_RESP" | head -n 1)
|
||||
REGISTER_DELIVERY_MODE=$(echo "$REGISTER_BODY_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('delivery_mode',''))")
|
||||
if [ "$REGISTER_DELIVERY_MODE" = "poll" ]; then
|
||||
ok "delivery_mode=poll (register response echoed explicit value)"
|
||||
else
|
||||
fail "Register response delivery_mode=$REGISTER_DELIVERY_MODE (expected poll). Body: $REGISTER_BODY_JSON"
|
||||
fi
|
||||
|
||||
# ─── 6. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER ────────
|
||||
# This is the SECOND silent-failure path (registry/healthsweep.go's
|
||||
# sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
|
||||
# UPDATE silently failed and the workspace stayed stuck on 'online' forever
|
||||
# even though no agent was alive. We wait the full window + a sweep
|
||||
# interval and assert the row transitions back to 'awaiting_agent'.
|
||||
log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
|
||||
sleep "$STALE_WAIT_SECS"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$STALE_STATUS" != "awaiting_agent" ] && \
|
||||
fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
|
||||
ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
|
||||
|
||||
# ─── 7. Re-register and confirm we can come back online ─────────────────
|
||||
# This proves the awaiting_agent state is recoverable (re-registrable),
|
||||
# which is the whole point of using it instead of 'offline'.
|
||||
log "7/8 Re-registering after stale → confirming recovery to online..."
|
||||
# Same payload contract as step 5 (id + agent_card both required). See note
|
||||
# there for why workspace_id would 400.
|
||||
REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
|
||||
-H "Authorization: Bearer $WS_AUTH_TOKEN" \
|
||||
-H "X-Molecule-Org-Id: $ORG_ID" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$REGISTER_BODY") || true
|
||||
log " re-register response: $(echo "$REREG_RESP" | head -c 300)"
|
||||
echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
|
||||
|
||||
GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
|
||||
RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
|
||||
[ "$RECOVERED_STATUS" != "online" ] && \
|
||||
fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
|
||||
ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
|
||||
|
||||
# ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
|
||||
log "8/8 All four awaiting_agent transitions verified."
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
ok "External-runtime E2E PASSED on $SLUG"
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
119
tests/harness/README.md
Normal file
@ -0,0 +1,119 @@
|
||||
# Production-shape local harness
|
||||
|
||||
The harness brings up the SaaS tenant topology on localhost using the
|
||||
same `Dockerfile.tenant` image that ships to production. Tests run
|
||||
against `http://harness-tenant.localhost:8080` and exercise the
|
||||
SAME code path a real tenant takes — including TenantGuard middleware,
|
||||
the `/cp/*` reverse proxy, the canvas reverse proxy, and a
|
||||
Cloudflare-tunnel-shape header rewrite layer.
|
||||
|
||||
## Why this exists
|
||||
|
||||
Local `go run ./cmd/server` skips:
|
||||
- `TenantGuard` middleware (no `MOLECULE_ORG_ID` env)
|
||||
- `/cp/*` reverse proxy mount (no `CP_UPSTREAM_URL` env)
|
||||
- `CANVAS_PROXY_URL` (canvas runs separately on `:3000`)
|
||||
- Header rewrites that production's CF tunnel + LB perform
|
||||
- Strict-auth mode (no live `ADMIN_TOKEN`)
|
||||
|
||||
Bugs that survive `go run` and ship to production almost always live
|
||||
in one of those layers. The harness activates ALL of them.
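
For reference, these are the switches the harness flips on. Values below are
illustrative; the authoritative list lives in `tests/harness/compose.yml`:

```bash
# Env that gates the production-only layers (illustrative values from compose.yml)
export MOLECULE_ORG_ID="harness-org"             # activates TenantGuard
export CP_UPSTREAM_URL="http://cp-stub:9090"     # mounts the /cp/* reverse proxy
export CANVAS_PROXY_URL="http://localhost:3000"  # canvas reverse-proxy target
export ADMIN_TOKEN="harness-admin-token"         # strict-auth mode
```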
|
||||
|
||||
## Topology
|
||||
|
||||
```
client
  ↓
cf-proxy    nginx, mirrors CF tunnel header rewrites
  ↓ (Host: harness-tenant.localhost, X-Forwarded-*)
tenant      workspace-server/Dockerfile.tenant — same image as prod
  ↓ (CP_UPSTREAM_URL=http://cp-stub:9090, /cp/* proxied)
cp-stub     minimal Go service, mocks CP wire surface
postgres    same version as production
redis       same version as production
```
|
||||
|
||||
## Quickstart
|
||||
|
||||
```bash
|
||||
cd tests/harness
|
||||
./up.sh # builds + starts all services
|
||||
./seed.sh            # registers two sample workspaces (alpha parent + beta child)
|
||||
./replays/peer-discovery-404.sh
|
||||
./replays/buildinfo-stale-image.sh
|
||||
./down.sh # tear down + remove volumes
|
||||
```
|
||||
|
||||
To run every replay in one shot (boot, seed, run-all, teardown):
|
||||
|
||||
```bash
|
||||
cd tests/harness
|
||||
./run-all-replays.sh # full lifecycle; non-zero exit if any replay fails
|
||||
KEEP_UP=1 ./run-all-replays.sh # leave harness up for debugging
|
||||
REBUILD=1 ./run-all-replays.sh # rebuild images before booting
|
||||
```
|
||||
|
||||
First-time setup needs an `/etc/hosts` entry so `harness-tenant.localhost`
|
||||
resolves to the local cf-proxy:
|
||||
|
||||
```bash
|
||||
echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts
|
||||
```
|
||||
|
||||
(macOS resolves `*.localhost` automatically in some setups; Linux
|
||||
typically does not.)
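
After `./seed.sh`, replay scripts read the seeded workspace IDs from
`tests/harness/.seed.env` — two shell assignments, roughly:

```bash
# .seed.env (values are fresh per-run UUIDs)
ALPHA_ID=1b2c3d4e-...   # parent workspace
BETA_ID=5f6a7b8c-...    # child of alpha
```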
|
||||
|
||||
## Replay scripts
|
||||
|
||||
Each replay script reproduces a real bug class against the harness so
|
||||
fixes can be verified locally before deploy. The bar for adding a
|
||||
replay is "this bug shipped to production despite local E2E being
|
||||
green" — the script becomes the regression gate that closes that gap.
|
||||
|
||||
| Replay | Closes | What it proves |
|
||||
|--------|--------|----------------|
|
||||
| `peer-discovery-404.sh` | #2397 | tool_list_peers surfaces the actual reason instead of "may be isolated" |
|
||||
| `buildinfo-stale-image.sh` | #2395 | GIT_SHA reaches the binary; verify-step comparison logic works |
|
||||
|
||||
To add a new replay:
|
||||
1. Drop a script under `replays/` named after the issue.
|
||||
2. The script should reproduce the production failure mode against the
   harness, then assert the fix is present — the PASS criterion is the
   post-fix behavior. A minimal skeleton is sketched below.
3. The `run-all-replays.sh` runner picks up every `replays/*.sh` script
   automatically — no per-replay registration needed.
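
A minimal skeleton (names, the curl target, and the `.id` assertion are
illustrative — adapt the request and the PASS check to the bug being pinned):

```bash
#!/usr/bin/env bash
# replays/issue-NNNN-short-name.sh — regression gate for hypothetical issue #NNNN
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$(dirname "$HERE")"            # operate from tests/harness/
source .seed.env                   # ALPHA_ID / BETA_ID written by ./seed.sh

BASE="${BASE:-http://harness-tenant.localhost:8080}"

# 1. Reproduce the production failure mode against the harness.
RESP=$(curl -sS -H "Authorization: Bearer harness-admin-token" \
            -H "X-Molecule-Org-Id: harness-org" \
            "$BASE/workspaces/$ALPHA_ID")

# 2. Assert the post-fix behavior (the PASS criterion).
echo "$RESP" | jq -e '.id == "'"$ALPHA_ID"'"' >/dev/null || {
  echo "[replay] FAIL: fix for #NNNN not present"; exit 1; }

echo "[replay] PASS: #NNNN stays fixed"
```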
|
||||
|
||||
## Extending the cp-stub
|
||||
|
||||
`cp-stub/main.go` serves the minimum surface for the existing replays
plus a catch-all that returns 501 + a clear message when the tenant
asks for a route the stub doesn't implement (see the probe sketch after
this list). To add a new CP route:
|
||||
|
||||
1. Add a `mux.HandleFunc` in `cp-stub/main.go` for the path.
|
||||
2. Return the same wire shape the real CP returns. The contract is
|
||||
"wire compatibility with the staging CP at the time of writing" —
|
||||
document it with a comment pointing at the real CP handler.
|
||||
3. Add a replay script that exercises the path.
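
To see the catch-all (and the `/__stub/state` counter) from a replay's point
of view, something like the following works — the headers match the seeded
harness values; the exact JSON fields are whatever `cp-stub/main.go` returns:

```bash
# An unimplemented CP route surfaces the stub's 501 hint instead of an opaque 502.
curl -sS -H "Authorization: Bearer harness-admin-token" \
         -H "X-Molecule-Org-Id: harness-org" \
         http://harness-tenant.localhost:8080/cp/some/new/route | jq .
# → {"error":"cp-stub: handler not implemented for GET /cp/some/new/route","hint":"..."}

# Stub-side counters are only reachable on the compose network, e.g. from
# inside the tenant container (wget is present there — the healthcheck uses it):
docker compose -f compose.yml exec tenant wget -q -O- http://cp-stub:9090/__stub/state
# → {"redeploy_fleet_calls":0}
```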
|
||||
|
||||
## What the harness does NOT cover
|
||||
|
||||
- Real TLS / cert handling (CF terminates TLS in production; harness is
|
||||
HTTP-only).
|
||||
- Cloudflare API edge cases (rate limits, DNS propagation timing).
|
||||
- Real EC2 / SSM / EBS behavior (image-cache replay simulates the
|
||||
outcome but not the AWS API surface).
|
||||
- Cross-region or multi-AZ topology.
|
||||
- Real production data scale.
|
||||
|
||||
These are intentional Phase 1 limits. If a bug class hits one of these
|
||||
gaps, escalate to staging E2E rather than expanding the harness past
|
||||
its mandate of "exercise the tenant binary in production-shape topology."
|
||||
|
||||
## Roadmap
|
||||
|
||||
- **Phase 1 (shipped):** harness + cp-stub + cf-proxy + 2 replays + `run-all-replays.sh` runner.
|
||||
- **Phase 2:** convert `tests/e2e/test_api.sh` to run against the
|
||||
harness instead of localhost. Make harness-based E2E a required CI
|
||||
check (a workflow that invokes `run-all-replays.sh` on every PR).
|
||||
- **Phase 3:** config-coherence lint that diffs the harness env list
  against production CP's env list and fails CI on drift (rough sketch below).
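
A rough sketch of what that lint could look like (assumes mikefarah `yq` v4;
the file holding the production env-key list is a placeholder — deciding where
that list actually lives is the Phase 3 work):

```bash
#!/usr/bin/env bash
# Hypothetical config-coherence check: fail when the tenant env keys the
# harness sets drift from the keys the production CP provisions.
set -euo pipefail

# Keys the harness sets on the tenant service (from compose.yml).
harness_keys=$(yq '.services.tenant.environment | keys | .[]' tests/harness/compose.yml | sort)

# Keys production provisions — placeholder path, to be replaced with the real source.
prod_keys=$(sort production-cp-env-keys.txt)

if ! diff <(echo "$harness_keys") <(echo "$prod_keys"); then
  echo "config drift between harness and production CP env lists" >&2
  exit 1
fi
```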
|
||||
68
tests/harness/cf-proxy/nginx.conf
Normal file
@ -0,0 +1,68 @@
|
||||
# cf-proxy — Cloudflare-tunnel-shape reverse proxy for the local harness.
|
||||
#
|
||||
# Production path: agent → CF tunnel → AWS LB → tenant container.
|
||||
# This config replays the same header rewrites the CF tunnel does so
|
||||
# the tenant sees the same Host + X-Forwarded-* it would in production.
|
||||
#
|
||||
# The tenant's TenantGuard middleware activates on MOLECULE_ORG_ID; the
|
||||
# canvas's same-origin fetches use the Host header for cookie scoping.
|
||||
# Both behave correctly in production because CF rewrites Host to the
|
||||
# tenant subdomain — this proxy reproduces that locally.
|
||||
#
|
||||
# How tests reach it:
|
||||
#   curl --resolve 'harness-tenant.localhost:8080:127.0.0.1' \
#        http://harness-tenant.localhost:8080/health
#   or via /etc/hosts (./up.sh warns on first boot if the entry is missing).
|
||||
|
||||
worker_processes 1;
|
||||
events { worker_connections 256; }
|
||||
|
||||
http {
|
||||
# Map the wildcard <slug>.localhost to the tenant container. The
|
||||
# tenant container itself doesn't care which slug routed to it —
|
||||
# what matters is that the Host header it sees matches what
|
||||
# production's CF tunnel sets, so cookie/CORS/TenantGuard logic
|
||||
# exercises the same code path.
|
||||
server {
|
||||
listen 8080;
|
||||
server_name *.localhost localhost;
|
||||
|
||||
# Cap upload at 50MB to mirror the staging tenant nginx limit;
|
||||
# chat upload tests will fail closed if the platform handler
|
||||
# ever silently expands its limit (catches the failure mode
|
||||
# opposite of the chat-files lazy-heal incident).
|
||||
client_max_body_size 50m;
|
||||
|
||||
location / {
|
||||
proxy_pass http://tenant:8080;
|
||||
|
||||
# Header parity with CF tunnel + AWS LB. Production CF sets
|
||||
# X-Forwarded-Proto=https; we keep http here because TLS
|
||||
# termination in compose is unnecessary for testing the
|
||||
# tenant logic — TLS is a CF concern, not a tenant bug
|
||||
# surface. If TLS-specific bugs ever bite, add cert-manager
|
||||
# + listen 8443 ssl here.
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Host $host;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# Streamable HTTP / SSE / WebSocket — the tenant exposes /ws
|
||||
# and /events/stream + MCP /mcp/stream. Disabling buffering
|
||||
# reproduces CF tunnel's pass-through streaming semantics
|
||||
# (CF tunnel = no buffering by default; nginx default IS
|
||||
# buffering, which would mask issue #2397-class streaming
|
||||
# bugs by accumulating output until the client disconnects).
|
||||
proxy_buffering off;
|
||||
proxy_request_buffering off;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Connection "";
|
||||
|
||||
# Read timeout — CF tunnel default is 100s. Setting this to
|
||||
# the same value catches "long agent run finishes after the
|
||||
# proxy already closed the upstream" failure mode.
|
||||
proxy_read_timeout 100s;
|
||||
}
|
||||
}
|
||||
}
|
||||
140
tests/harness/compose.yml
Normal file
@ -0,0 +1,140 @@
|
||||
# Production-shape harness for local E2E.
|
||||
#
|
||||
# Reproduces the SaaS tenant topology on localhost using the SAME
|
||||
# images that ship to production:
|
||||
#
|
||||
# client → cf-proxy (nginx, mimics CF tunnel headers)
|
||||
# → tenant (workspace-server/Dockerfile.tenant — combined platform + canvas)
|
||||
# → cp-stub (control-plane stand-in) for /cp/* and CP-callback paths
|
||||
# → postgres + redis (same versions as production)
|
||||
#
|
||||
# Why this matters: the workspace-server binary IS identical between
|
||||
# local and production. The bugs that survive local E2E are topology
|
||||
# bugs — env-gated middleware (TenantGuard, CP proxy, Canvas proxy),
|
||||
# auth state, header rewrites, real production image. This harness
|
||||
# activates ALL of them.
|
||||
#
|
||||
# Quickstart:
|
||||
# cd tests/harness && ./up.sh
|
||||
# ./seed.sh
|
||||
# ./replays/peer-discovery-404.sh # reproduces issue #2397
|
||||
#
|
||||
# Env config:
|
||||
# GIT_SHA — passed to the tenant build for /buildinfo verification.
|
||||
# Defaults to "harness" so /buildinfo distinguishes the
|
||||
# harness build from any cached image.
|
||||
# CP_STUB_PEERS_MODE — peers failure mode for replay scripts.
|
||||
# "" / "404" / "401" / "500" / "timeout".
|
||||
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:16-alpine
|
||||
environment:
|
||||
POSTGRES_USER: harness
|
||||
POSTGRES_PASSWORD: harness
|
||||
POSTGRES_DB: molecule
|
||||
networks: [harness-net]
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U harness"]
|
||||
interval: 2s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
networks: [harness-net]
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 2s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
cp-stub:
|
||||
build:
|
||||
context: ./cp-stub
|
||||
environment:
|
||||
PORT: "9090"
|
||||
CP_STUB_PEERS_MODE: "${CP_STUB_PEERS_MODE:-}"
|
||||
networks: [harness-net]
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q -O- http://localhost:9090/healthz || exit 1"]
|
||||
interval: 2s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
# The actual production tenant image — same Dockerfile.tenant CI publishes.
|
||||
# This is the load-bearing part of the harness: every bug class that hides
|
||||
# behind "but it works locally" is reproducible HERE, against this image,
|
||||
# not against `go run ./cmd/server`.
|
||||
tenant:
|
||||
build:
|
||||
context: ../..
|
||||
dockerfile: workspace-server/Dockerfile.tenant
|
||||
args:
|
||||
GIT_SHA: "${GIT_SHA:-harness}"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
cp-stub:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
DATABASE_URL: "postgres://harness:harness@postgres:5432/molecule?sslmode=disable"
|
||||
REDIS_URL: "redis://redis:6379"
|
||||
PORT: "8080"
|
||||
PLATFORM_URL: "http://tenant:8080"
|
||||
MOLECULE_ENV: "production"
|
||||
# SECRETS_ENCRYPTION_KEY is required when MOLECULE_ENV=production —
|
||||
# crypto.InitStrict() refuses to boot without it. up.sh generates a
|
||||
# fresh 32-byte key per harness lifetime via `openssl rand -base64 32`
|
||||
# and exports it into this compose file's interpolation environment.
|
||||
# The :? sentinel makes the misuse loud — running `docker compose up`
|
||||
# directly without going through up.sh fails fast with a clear error
|
||||
# rather than getting a confusing tenant-unhealthy timeout.
|
||||
SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}"
|
||||
# ADMIN_TOKEN flips the platform into strict-auth mode (matches
|
||||
# production's CP-minted token configuration). Seeded value lets
|
||||
# E2E scripts authenticate without going through CP.
|
||||
ADMIN_TOKEN: "harness-admin-token"
|
||||
# MOLECULE_ORG_ID — activates TenantGuard middleware. Every request
|
||||
# must carry X-Molecule-Org-Id matching this value. Replays bugs
|
||||
# that only fire in SaaS mode.
|
||||
MOLECULE_ORG_ID: "harness-org"
|
||||
# CP_UPSTREAM_URL — activates the /cp/* reverse proxy mount in
|
||||
# router.go. Without this set, /cp/* would 404 and the canvas
|
||||
# bootstrap would silently drift from production behavior.
|
||||
CP_UPSTREAM_URL: "http://cp-stub:9090"
|
||||
RATE_LIMIT: "1000"
|
||||
# Canvas auto-proxy — entrypoint-tenant.sh exports CANVAS_PROXY_URL
|
||||
# by default; keeping it explicit here makes the topology readable.
|
||||
CANVAS_PROXY_URL: "http://localhost:3000"
|
||||
networks: [harness-net]
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/health || exit 1"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 20
|
||||
|
||||
# Cloudflare-tunnel-shape proxy — strips the :8080 suffix, rewrites
|
||||
# Host to the tenant subdomain, injects X-Forwarded-*. Tests target
|
||||
# http://harness-tenant.localhost:8080 and exercise the production
|
||||
# routing layer.
|
||||
cf-proxy:
|
||||
image: nginx:1.27-alpine
|
||||
depends_on:
|
||||
tenant:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
|
||||
# Bind to 127.0.0.1 only — the harness uses a hardcoded ADMIN_TOKEN
|
||||
# ("harness-admin-token") so binding 0.0.0.0 (compose's default)
|
||||
# would expose admin access to anyone on the local network or VPN.
|
||||
# Loopback-only is safe for E2E and prevents a known-token leak.
|
||||
ports:
|
||||
- "127.0.0.1:8080:8080"
|
||||
networks: [harness-net]
|
||||
|
||||
networks:
|
||||
harness-net:
|
||||
name: molecule-harness-net
|
||||
14
tests/harness/cp-stub/Dockerfile
Normal file
@ -0,0 +1,14 @@
|
||||
# cp-stub — minimal CP stand-in for the local production-shape harness.
|
||||
# See main.go for the rationale. Self-contained build, no module deps.
|
||||
|
||||
FROM golang:1.25-alpine AS builder
|
||||
WORKDIR /src
|
||||
COPY go.mod ./
|
||||
COPY main.go ./
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /cp-stub .
|
||||
|
||||
FROM alpine:3.20
|
||||
RUN apk add --no-cache ca-certificates
|
||||
COPY --from=builder /cp-stub /cp-stub
|
||||
EXPOSE 9090
|
||||
ENTRYPOINT ["/cp-stub"]
|
||||
3
tests/harness/cp-stub/go.mod
Normal file
@ -0,0 +1,3 @@
|
||||
module github.com/Molecule-AI/molecule-monorepo/tests/harness/cp-stub
|
||||
|
||||
go 1.25
|
||||
113
tests/harness/cp-stub/main.go
Normal file
@ -0,0 +1,113 @@
|
||||
// cp-stub — minimal control-plane stand-in for the local production-shape harness.
|
||||
//
|
||||
// In production, the tenant Go server reverse-proxies /cp/* to the SaaS
|
||||
// control-plane (molecule-controlplane). This stub plays that role on
|
||||
// localhost so we can exercise the SAME code path the tenant takes in
|
||||
// production — `if cpURL := os.Getenv("CP_UPSTREAM_URL"); cpURL != ""`
|
||||
// in workspace-server/internal/router/router.go fires, the proxy mount
|
||||
// activates, and tests exercise the real tenant→CP wire.
|
||||
//
|
||||
// This is NOT a CP reimplementation. It serves the minimum surface to:
|
||||
// 1. Boot the tenant image without /cp/* breaking the canvas bootstrap.
|
||||
// 2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx,
|
||||
// returns malformed JSON) by toggling env vars.
|
||||
//
|
||||
// Scope is bounded by what the tenant + canvas actually call. Add new
|
||||
// handlers as new replay scenarios demand them. Drift from real CP is
|
||||
// tolerated because each handler is named for the exact path it serves —
|
||||
// when the real CP changes, the failing scenario tells us where to look.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// redeployFleetCalls tracks how many times /cp/admin/tenants/redeploy-fleet
|
||||
// was invoked. Replay scripts assert > 0 to confirm the workflow's redeploy
|
||||
// step actually reached the stub (catches misrouted CP_URL configs).
|
||||
var redeployFleetCalls atomic.Int64
|
||||
|
||||
func main() {
|
||||
mux := http.NewServeMux()
|
||||
|
||||
// /cp/auth/me — canvas calls this on bootstrap; minimal user record
|
||||
// keeps the canvas from redirecting to login during local E2E.
|
||||
mux.HandleFunc("/cp/auth/me", func(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, 200, map[string]any{
|
||||
"id": "harness-user",
|
||||
"email": "harness@local",
|
||||
"org_id": "harness-org",
|
||||
"roles": []string{"admin"},
|
||||
})
|
||||
})
|
||||
|
||||
// /cp/admin/tenants/redeploy-fleet — exercised by the
|
||||
// redeploy-tenants-on-{staging,main} workflow's local replay. Returns
|
||||
// the same shape the real CP returns so the verify-fleet logic in CI
|
||||
// can be tested without spinning up a real EC2 fleet.
|
||||
mux.HandleFunc("/cp/admin/tenants/redeploy-fleet", func(w http.ResponseWriter, r *http.Request) {
|
||||
redeployFleetCalls.Add(1)
|
||||
writeJSON(w, 200, map[string]any{
|
||||
"ok": true,
|
||||
"results": []map[string]any{
|
||||
{
|
||||
"slug": "harness-tenant",
|
||||
"phase": "redeploy",
|
||||
"ssm_status": "Success",
|
||||
"ssm_exit_code": 0,
|
||||
"healthz_ok": true,
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
// __stub/state — expose stub state (counters) so replay scripts can
|
||||
// assert the tenant actually reached us. Read-only.
|
||||
mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, 200, map[string]any{
|
||||
"redeploy_fleet_calls": redeployFleetCalls.Load(),
|
||||
})
|
||||
})
|
||||
|
||||
// Catch-all for any /cp/* the tenant proxies. Keeps the harness from
|
||||
// crashing the canvas when a new CP route is added — surfaces a clear
|
||||
// "stub doesn't implement X" error instead of opaque 502 from the
|
||||
// reverse proxy.
|
||||
mux.HandleFunc("/cp/", func(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, 501, map[string]any{
|
||||
"error": "cp-stub: handler not implemented for " + r.Method + " " + r.URL.Path,
|
||||
"hint": "add a handler in tests/harness/cp-stub/main.go for the scenario you're testing",
|
||||
})
|
||||
})
|
||||
|
||||
// /healthz — readiness probe for compose's depends_on.
|
||||
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, 200, map[string]any{"status": "ok"})
|
||||
})
|
||||
|
||||
addr := ":" + envOr("PORT", "9090")
|
||||
log.Printf("cp-stub listening on %s", addr)
|
||||
if err := http.ListenAndServe(addr, mux); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func writeJSON(w http.ResponseWriter, code int, body any) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(code)
|
||||
if err := json.NewEncoder(w).Encode(body); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "cp-stub: write json: %v\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
func envOr(k, def string) string {
|
||||
if v := os.Getenv(k); v != "" {
|
||||
return v
|
||||
}
|
||||
return def
|
||||
}
|
||||
6
tests/harness/down.sh
Executable file
@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
cd "$HERE"
|
||||
docker compose -f compose.yml down -v --remove-orphans
|
||||
echo "[harness] down + volumes removed."
|
||||
75
tests/harness/replays/buildinfo-stale-image.sh
Executable file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env bash
|
||||
# Replay for issue #2395 — local proof that the /buildinfo verify gate
|
||||
# closes the SaaS deploy-chain blindness.
|
||||
#
|
||||
# Prior behavior: redeploy-fleet returned ssm_status=Success based on
|
||||
# the SSM RPC return code alone. EC2 tenants kept serving the cached
|
||||
# :latest digest because `docker compose up -d` is a no-op when the
|
||||
# tag hasn't been invalidated. ssm_status=Success was lying.
|
||||
#
|
||||
# This replay simulates that condition locally:
|
||||
# 1. Boot the harness with GIT_SHA=fix-applied.
|
||||
# 2. Curl /buildinfo and assert it returns "fix-applied" (the new code
|
||||
# actually shipped).
|
||||
# 3. Negative test: curl with a different EXPECTED_SHA and assert the
|
||||
# mismatch detection logic the workflow uses returns failure.
|
||||
#
|
||||
# This proves the verify-step's jq lookup + comparison logic works
|
||||
# against the SAME Dockerfile.tenant production builds. If the
|
||||
# /buildinfo route ever stops being wired through, this replay
|
||||
# catches it before it reaches a production tenant.
|
||||
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
HARNESS_ROOT="$(dirname "$HERE")"
|
||||
|
||||
BASE="${BASE:-http://harness-tenant.localhost:8080}"
|
||||
|
||||
# 1. Confirm /buildinfo wire shape — same shape the workflow's jq lookup expects.
|
||||
echo "[replay] curl $BASE/buildinfo ..."
|
||||
BUILD_JSON=$(curl -sS "$BASE/buildinfo")
|
||||
echo "[replay] $BUILD_JSON"
|
||||
|
||||
ACTUAL_SHA=$(echo "$BUILD_JSON" | jq -r '.git_sha // ""')
|
||||
if [ -z "$ACTUAL_SHA" ]; then
|
||||
echo "[replay] FAIL: /buildinfo response missing git_sha field — workflow's jq lookup would null"
|
||||
exit 1
|
||||
fi
|
||||
echo "[replay] git_sha=$ACTUAL_SHA"
|
||||
|
||||
# 2. Assert the harness build threaded GIT_SHA through. If we got "dev",
|
||||
# the Dockerfile arg / ldflags wiring is broken — same regression
|
||||
# class that made #2395 invisible until production.
|
||||
EXPECTED_FROM_HARNESS="${HARNESS_GIT_SHA:-harness}"
|
||||
if [ "$ACTUAL_SHA" = "dev" ]; then
|
||||
echo "[replay] FAIL: /buildinfo returned 'dev' — Dockerfile.tenant ARG GIT_SHA isn't reaching the binary"
|
||||
echo "[replay] This regresses #2395 by silencing the deploy-verify gate."
|
||||
exit 1
|
||||
fi
|
||||
if [ "$ACTUAL_SHA" != "$EXPECTED_FROM_HARNESS" ]; then
|
||||
echo "[replay] WARN: /buildinfo returned '$ACTUAL_SHA' but harness was built with GIT_SHA='$EXPECTED_FROM_HARNESS'"
|
||||
echo "[replay] Image may be cached from a previous run. Run ./up.sh --rebuild to force a fresh build."
|
||||
fi
|
||||
|
||||
# 3. Negative test — replay the workflow's mismatch detection by
|
||||
# comparing the actual SHA to a deliberately-wrong expected SHA.
|
||||
WRONG_EXPECTED="0000000000000000000000000000000000000000"
|
||||
if [ "$ACTUAL_SHA" = "$WRONG_EXPECTED" ]; then
|
||||
echo "[replay] FAIL: /buildinfo returned all-zero SHA — wiring inverted"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 4. Replay the workflow's exact comparison logic so a regression in
|
||||
# the verify step's bash gets caught here.
|
||||
MISMATCH_DETECTED=0
|
||||
if [ "$ACTUAL_SHA" != "$WRONG_EXPECTED" ]; then
|
||||
MISMATCH_DETECTED=1
|
||||
fi
|
||||
if [ "$MISMATCH_DETECTED" != "1" ]; then
|
||||
echo "[replay] FAIL: workflow comparison logic would not flag a real mismatch"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "[replay] PASS: /buildinfo wire shape, GIT_SHA injection, and mismatch detection all work in"
|
||||
echo " production-shape topology. The redeploy-fleet verify-step covers what it claims to."
|
||||
139
tests/harness/replays/peer-discovery-404.sh
Executable file
@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env bash
|
||||
# Replay for issue #2397 — local proof that peer-discovery surfaces
|
||||
# actionable diagnostics instead of "may be isolated".
|
||||
#
|
||||
# Prior behavior: tool_list_peers returned "No peers available (this
|
||||
# workspace may be isolated)" regardless of WHY peers were empty —
|
||||
# five distinct conditions (200+empty, 401, 403, 404, 5xx, network)
|
||||
# collapsed to one ambiguous message.
|
||||
#
|
||||
# This replay proves two things, separately:
|
||||
# (a) WIRE: the platform side of the contract — the tenant's
|
||||
# /registry/<unregistered>/peers returns 404. If this regresses
|
||||
# (e.g. tenant starts returning 200 with empty list, or 500),
|
||||
# the runtime helper would parse it differently and the agent
|
||||
# would see a different diagnostic. The harness catches that here.
|
||||
# (b) PARSE: the runtime helper, given a 404, produces a diagnostic
#     containing "404" + "register" hints. That branch is covered by unit
#     tests against a mock httpx response
#     (test_a2a_client.py::TestGetPeersWithDiagnostic); the harness
#     re-asserts the same contract here against a real Python eval that
#     does NOT depend on workspace auth tokens.
|
||||
#
|
||||
# Why split the assertion: the Python eval here doesn't have the
|
||||
# workspace's auth token file, so going through get_peers_with_diagnostic
|
||||
# directly would hit the platform without auth and produce a different
|
||||
# branch (401 instead of 404). Splitting (a) from (b) keeps each
|
||||
# assertion targeting exactly what it claims to test.
|
||||
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
HARNESS_ROOT="$(dirname "$HERE")"
|
||||
cd "$HARNESS_ROOT"
|
||||
|
||||
if [ ! -f .seed.env ]; then
|
||||
echo "[replay] no .seed.env — running ./seed.sh first..."
|
||||
./seed.sh
|
||||
fi
|
||||
# shellcheck source=/dev/null
|
||||
source .seed.env
|
||||
|
||||
BASE="${BASE:-http://harness-tenant.localhost:8080}"
|
||||
ADMIN="harness-admin-token"
|
||||
ORG="harness-org"
|
||||
|
||||
# ─── (a) WIRE: tenant returns 404 for an unregistered workspace ────────
|
||||
ROGUE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')"
|
||||
echo "[replay] (a) WIRE: querying /registry/$ROGUE_ID/peers (unregistered workspace)..."
|
||||
HTTP_CODE=$(curl -sS -o /tmp/peer-replay.json -w '%{http_code}' \
|
||||
-H "Authorization: Bearer $ADMIN" \
|
||||
-H "X-Molecule-Org-Id: $ORG" \
|
||||
-H "X-Workspace-ID: $ROGUE_ID" \
|
||||
"$BASE/registry/$ROGUE_ID/peers")
|
||||
|
||||
echo "[replay] tenant responded HTTP $HTTP_CODE"
|
||||
if [ "$HTTP_CODE" != "404" ]; then
|
||||
echo "[replay] FAIL (a): expected 404 from /registry/<unregistered>/peers, got $HTTP_CODE"
|
||||
echo "[replay] This is a platform-side regression — the runtime's diagnostic helper"
|
||||
echo "[replay] would see a different status code than the unit tests cover."
|
||||
cat /tmp/peer-replay.json
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# ─── (b) PARSE: helper converts a synthetic 404 to actionable diagnostic ─
|
||||
#
|
||||
# We construct a synthetic httpx 404 response and run the helper against
|
||||
# it directly. This isolates the parse branch we want to test from the
|
||||
# auth-context concerns of going through the network. The helper's network
|
||||
# branches are exhaustively covered by tests/test_a2a_client.py — this is
|
||||
# a regression-guard that the helper IS in the install, IS importable in
|
||||
# the harness's Python env, and IS reading the status code.
|
||||
|
||||
WORKSPACE_PATH="$(cd "$HARNESS_ROOT/../../workspace" && pwd)"
|
||||
DIAGNOSTIC=$(WORKSPACE_ID="harness-rogue" PYTHONPATH="$WORKSPACE_PATH" \
|
||||
python3 - "$WORKSPACE_PATH" <<'PYEOF'
|
||||
import asyncio
|
||||
import sys
|
||||
import types
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
# Stub platform_auth so a2a_client imports cleanly without requiring a
|
||||
# real workspace token file. The helper's auth_headers() only matters
|
||||
# when going through the network; we're feeding it a mock response.
|
||||
_pa = types.ModuleType("platform_auth")
|
||||
_pa.auth_headers = lambda: {}
|
||||
_pa.self_source_headers = lambda: {}
|
||||
sys.modules.setdefault("platform_auth", _pa)
|
||||
|
||||
sys.path.insert(0, sys.argv[1])
|
||||
import a2a_client # noqa: E402
|
||||
|
||||
# This replay validates PR #2399's diagnostic helper. If the workspace
|
||||
# runtime in the current checkout pre-dates that fix, fail with a
|
||||
# clear message instead of an opaque AttributeError.
|
||||
if not hasattr(a2a_client, "get_peers_with_diagnostic"):
|
||||
print("__SKIP__: workspace/a2a_client.py is pre-#2399 (no get_peers_with_diagnostic).")
|
||||
sys.exit(0)
|
||||
|
||||
resp = MagicMock()
|
||||
resp.status_code = 404
|
||||
resp.json = MagicMock(return_value={"detail": "not found"})
|
||||
|
||||
mock_client = AsyncMock()
|
||||
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||
mock_client.__aexit__ = AsyncMock(return_value=False)
|
||||
mock_client.get = AsyncMock(return_value=resp)
|
||||
|
||||
async def main():
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
peers, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
print(repr(diag))
|
||||
|
||||
asyncio.run(main())
|
||||
PYEOF
|
||||
)
|
||||
|
||||
if [[ "$DIAGNOSTIC" == __SKIP__:* ]]; then
|
||||
echo "[replay] (b) SKIP: ${DIAGNOSTIC#__SKIP__: }"
|
||||
echo "[replay] Re-run after #2399 lands on staging."
|
||||
echo ""
|
||||
echo "[replay] PASS (a) only: peer-discovery wire returns 404 (parse branch skipped — see above)."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "[replay] (b) PARSE: helper diagnostic = $DIAGNOSTIC"
|
||||
|
||||
if ! echo "$DIAGNOSTIC" | grep -q "404"; then
|
||||
echo "[replay] FAIL (b): diagnostic missing '404' — helper regressed to swallow-the-status-code"
|
||||
exit 1
|
||||
fi
|
||||
if ! echo "$DIAGNOSTIC" | grep -qi "regist"; then
|
||||
echo "[replay] FAIL (b): diagnostic missing 'register' guidance — helper regressed to opaque message"
|
||||
exit 1
|
||||
fi
|
||||
if echo "$DIAGNOSTIC" | grep -qi "may be isolated"; then
|
||||
echo "[replay] FAIL (b): diagnostic still says 'may be isolated' — fix didn't reach this code path"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "[replay] PASS: peer-discovery (a) wire returns 404, (b) helper produces actionable diagnostic."
|
||||
14
tests/harness/requirements.txt
Normal file
@ -0,0 +1,14 @@
|
||||
# Harness-replay Python deps — minimal set for replays/*.sh scripts that
|
||||
# eval Python against the running tenant (e.g. importing
|
||||
# workspace/a2a_client.py to assert parser behavior).
|
||||
#
|
||||
# This is intentionally smaller than workspace/requirements.txt: the
|
||||
# replays don't need a2a-sdk, langchain, opentelemetry, etc. — only the
|
||||
# HTTP client surface that the imported helpers depend on. Adding the
|
||||
# full workspace deps would slow every harness CI run by ~30s for no
|
||||
# gain.
|
||||
#
|
||||
# Add a line here (with a version constraint matching workspace/requirements.txt)
|
||||
# when a new replay introduces a new Python import.
|
||||
|
||||
httpx>=0.28.1
|
||||
90
tests/harness/run-all-replays.sh
Executable file
@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
|
||||
# Run every replay under tests/harness/replays/ against a fresh harness.
|
||||
#
|
||||
# Boots the harness (up.sh + seed.sh), runs each `replays/*.sh` in
|
||||
# alphabetical order, tracks pass/fail, and tears down on exit. Returns
|
||||
# non-zero if any replay failed.
|
||||
#
|
||||
# Usage:
|
||||
# ./run-all-replays.sh # boot, run, teardown
|
||||
# KEEP_UP=1 ./run-all-replays.sh # leave harness running on exit (debug)
|
||||
# REBUILD=1 ./run-all-replays.sh # rebuild images before booting
|
||||
#
|
||||
# CI usage: invoke without flags. The trap-on-EXIT teardown ensures we
|
||||
# don't leak Docker resources when a replay fails partway through.
|
||||
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
cd "$HERE"
|
||||
|
||||
REPLAYS_DIR="$HERE/replays"
|
||||
if [ ! -d "$REPLAYS_DIR" ]; then
|
||||
echo "[run-all] no replays/ directory at $REPLAYS_DIR — nothing to run"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
shopt -s nullglob
|
||||
REPLAYS=("$REPLAYS_DIR"/*.sh)
|
||||
shopt -u nullglob
|
||||
if [ ${#REPLAYS[@]} -eq 0 ]; then
|
||||
echo "[run-all] replays/ is empty — nothing to run"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
local exit_code=$?
|
||||
if [ "${KEEP_UP:-0}" = "1" ]; then
|
||||
echo ""
|
||||
echo "[run-all] KEEP_UP=1 — leaving harness up. Tear down manually with ./down.sh"
|
||||
else
|
||||
echo ""
|
||||
echo "[run-all] tearing down harness..."
|
||||
./down.sh >/dev/null 2>&1 || echo "[run-all] WARN: ./down.sh exited non-zero"
|
||||
fi
|
||||
exit "$exit_code"
|
||||
}
|
||||
trap cleanup EXIT INT TERM
|
||||
|
||||
echo "[run-all] booting harness..."
|
||||
if [ "${REBUILD:-0}" = "1" ]; then
|
||||
./up.sh --rebuild
|
||||
else
|
||||
./up.sh
|
||||
fi
|
||||
|
||||
echo "[run-all] seeding workspaces..."
|
||||
./seed.sh
|
||||
|
||||
PASS_COUNT=0
|
||||
FAIL_COUNT=0
|
||||
SKIP_COUNT=0
|
||||
FAILED_NAMES=()
|
||||
|
||||
for replay in "${REPLAYS[@]}"; do
|
||||
name=$(basename "$replay" .sh)
|
||||
echo ""
|
||||
echo "[run-all] ━━━ $name ━━━"
|
||||
if bash "$replay"; then
|
||||
# Replays signal "skip" by exiting 0 with a __SKIP__ marker in stdout —
|
||||
# but we capture that as a pass here since the script exited 0. The
|
||||
# skip is documented in the script's own output. CI uses pass/fail.
|
||||
PASS_COUNT=$((PASS_COUNT + 1))
|
||||
echo "[run-all] PASS: $name"
|
||||
else
|
||||
FAIL_COUNT=$((FAIL_COUNT + 1))
|
||||
FAILED_NAMES+=("$name")
|
||||
echo "[run-all] FAIL: $name"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "[run-all] ============================="
|
||||
echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed (of ${#REPLAYS[@]} total)"
|
||||
if [ ${FAIL_COUNT} -gt 0 ]; then
|
||||
echo "[run-all] Failed:"
|
||||
for name in "${FAILED_NAMES[@]}"; do
|
||||
echo "[run-all] - $name"
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
echo "[run-all] All replays passed."
|
||||
65
tests/harness/seed.sh
Executable file
@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env bash
|
||||
# Seed the harness with two registered workspaces so peer-discovery
|
||||
# replay scripts have something to discover.
|
||||
#
|
||||
# - "alpha" parent (tier 0)
|
||||
# - "beta" child of alpha (tier 1)
|
||||
#
|
||||
# Both register via the platform's /registry/register endpoint, which
|
||||
# is what real workspaces do at boot. The platform then has them in its
|
||||
# DB; tool_list_peers from inside alpha can resolve beta as a peer.
|
||||
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
cd "$HERE"
|
||||
|
||||
BASE="${BASE:-http://harness-tenant.localhost:8080}"
|
||||
ADMIN="harness-admin-token"
|
||||
ORG="harness-org"
|
||||
|
||||
curl_admin() {
|
||||
curl -sS -H "Authorization: Bearer $ADMIN" \
|
||||
-H "X-Molecule-Org-Id: $ORG" \
|
||||
-H "Content-Type: application/json" "$@"
|
||||
}
|
||||
|
||||
echo "[seed] confirming tenant is reachable via cf-proxy..."
|
||||
HEALTH=$(curl -sS "$BASE/health" || echo "")
|
||||
if [ -z "$HEALTH" ]; then
|
||||
echo "[seed] FAILED: $BASE/health unreachable. Did ./up.sh complete? Did you add"
|
||||
echo " 127.0.0.1 harness-tenant.localhost to /etc/hosts?"
|
||||
exit 1
|
||||
fi
|
||||
echo "[seed] $HEALTH"
|
||||
|
||||
echo "[seed] confirming /buildinfo returns the harness GIT_SHA..."
|
||||
BUILD=$(curl -sS "$BASE/buildinfo" || echo "")
|
||||
echo "[seed] $BUILD"
|
||||
|
||||
# Generate a fresh workspace ID for the parent. The platform's
# /admin/workspaces/:id/test-token endpoint mints a per-workspace bearer;
# replay scripts use it to call the workspace-scoped routes.
|
||||
echo "[seed] creating workspace 'alpha' (parent)..."
|
||||
ALPHA_ID=$(uuidgen | tr '[:upper:]' '[:lower:]')
|
||||
curl_admin -X POST "$BASE/workspaces" \
|
||||
-d "{\"id\":\"$ALPHA_ID\",\"name\":\"alpha\",\"tier\":0,\"runtime\":\"langgraph\"}" \
|
||||
>/dev/null
|
||||
echo "[seed] alpha id=$ALPHA_ID"
|
||||
|
||||
echo "[seed] creating workspace 'beta' (child of alpha)..."
|
||||
BETA_ID=$(uuidgen | tr '[:upper:]' '[:lower:]')
|
||||
curl_admin -X POST "$BASE/workspaces" \
|
||||
-d "{\"id\":\"$BETA_ID\",\"name\":\"beta\",\"tier\":1,\"parent_id\":\"$ALPHA_ID\",\"runtime\":\"langgraph\"}" \
|
||||
>/dev/null
|
||||
echo "[seed] beta id=$BETA_ID"
|
||||
|
||||
# Stash IDs so replay scripts pick them up.
|
||||
{
|
||||
echo "ALPHA_ID=$ALPHA_ID"
|
||||
echo "BETA_ID=$BETA_ID"
|
||||
} > "$HERE/.seed.env"
|
||||
|
||||
echo ""
|
||||
echo "[seed] done. IDs persisted to tests/harness/.seed.env"
|
||||
echo "[seed] ALPHA_ID=$ALPHA_ID"
|
||||
echo "[seed] BETA_ID=$BETA_ID"
|
||||
55
tests/harness/up.sh
Executable file
@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
|
||||
# Bring the production-shape harness up.
|
||||
#
|
||||
# Usage: ./up.sh [--rebuild]
|
||||
#
|
||||
# Always operates in tests/harness/ regardless of where it's invoked
|
||||
# from — test scripts under tests/harness/replays/ source it via the
|
||||
# absolute path, so cd-ing first prevents compose-context surprises.
|
||||
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
cd "$HERE"
|
||||
|
||||
REBUILD=false
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--rebuild) REBUILD=true ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Generate a per-run encryption key. The tenant runs with
|
||||
# MOLECULE_ENV=production (intentional, to replay prod-shape bugs), and
|
||||
# crypto.InitStrict() refuses to boot without SECRETS_ENCRYPTION_KEY.
|
||||
# Generate fresh so:
|
||||
# - No key-shaped string lives in the repo (avoids muscle-memorying a
|
||||
# hardcoded value into other places + secret-scanner false positives).
|
||||
# - Each harness lifetime gets a unique key, mimicking prod's per-tenant
|
||||
# isolation. Persistence across runs isn't required — the harness DB
|
||||
# is wiped on every ./down.sh.
|
||||
# Honor a caller-supplied value if already exported (lets a debug session
|
||||
# pin a key for reproducibility).
|
||||
if [ -z "${SECRETS_ENCRYPTION_KEY:-}" ]; then
|
||||
SECRETS_ENCRYPTION_KEY=$(openssl rand -base64 32)
|
||||
export SECRETS_ENCRYPTION_KEY
|
||||
fi
|
||||
|
||||
if [ "$REBUILD" = true ]; then
|
||||
docker compose -f compose.yml build --no-cache tenant cp-stub
|
||||
fi
|
||||
|
||||
echo "[harness] starting cp-stub + postgres + redis + tenant + cf-proxy ..."
|
||||
docker compose -f compose.yml up -d --wait
|
||||
|
||||
echo "[harness] /etc/hosts entry for harness-tenant.localhost..."
|
||||
if ! grep -q '^127\.0\.0\.1[[:space:]]\+harness-tenant\.localhost' /etc/hosts; then
|
||||
echo " (skip — your /etc/hosts may not resolve *.localhost. If tests fail with"
|
||||
echo " 'getaddrinfo' errors, add: 127.0.0.1 harness-tenant.localhost)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "[harness] up. Tenant: http://harness-tenant.localhost:8080/health"
|
||||
echo " http://harness-tenant.localhost:8080/buildinfo"
|
||||
echo " cp-stub: http://localhost (internal-only via compose net)"
|
||||
echo ""
|
||||
echo "Next: ./seed.sh # mint admin token + register sample workspaces"
|
||||
@ -223,13 +223,24 @@ func main() {
|
||||
registry.StartLivenessMonitor(c, onWorkspaceOffline)
|
||||
})
|
||||
|
||||
// Proactive container health sweep — detects dead containers faster than Redis TTL.
|
||||
// Checks all "online" workspaces against Docker every 15 seconds.
|
||||
if prov != nil {
|
||||
go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
|
||||
registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
|
||||
})
|
||||
}
|
||||
// Proactive health sweep — two passes per tick:
|
||||
// 1. Docker-side: checks "online" workspaces against the local Docker
|
||||
// daemon (only runs when prov is non-nil, i.e. self-hosted mode).
|
||||
// 2. Remote-side: scans runtime='external' rows whose last_heartbeat_at
|
||||
// is past REMOTE_LIVENESS_STALE_AFTER and flips them to
|
||||
// awaiting_agent. Runs regardless of provisioner mode — SaaS
|
||||
// tenants need this even though they don't run Docker locally,
|
||||
// because external-runtime workspaces are operator-managed and
|
||||
// the platform-side liveness sweep is the only thing that
|
||||
// transitions them off 'online' when the operator's CLI dies.
|
||||
//
|
||||
// Pre-2026-04-30 this goroutine was gated on prov != nil, which silently
|
||||
// disabled the remote-side sweep on every SaaS tenant. The function in
|
||||
// healthsweep.go has always handled nil checker correctly; only the
|
||||
// orchestration was wrong. See #2392's CI failure for the trace.
|
||||
go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
|
||||
registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
|
||||
})
|
||||
|
||||
// Orphan-container reconcile sweep — finds running containers
|
||||
// whose workspace row is already status='removed' and stops
|
||||
|
||||
@ -53,6 +53,7 @@ const tenantOrgIDHeader = "X-Molecule-Org-Id"
|
||||
// here only bypasses the cross-org routing check, not auth.
|
||||
var tenantGuardAllowlist = map[string]struct{}{
|
||||
"/health": {},
|
||||
"/buildinfo": {},
|
||||
"/metrics": {},
|
||||
"/registry/register": {},
|
||||
"/registry/heartbeat": {},
|
||||
|
||||
@ -8,13 +8,15 @@ import (
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// helper: build a router with TenantGuard configured to `orgID` and two
|
||||
// representative routes — a regular API route and two allowlisted ones.
|
||||
// helper: build a router with TenantGuard configured to `orgID` and a
|
||||
// representative API route plus the public allowlisted ones (/health,
|
||||
// /buildinfo, /metrics).
|
||||
func newGuardedRouter(orgID string) *gin.Engine {
|
||||
gin.SetMode(gin.TestMode)
|
||||
r := gin.New()
|
||||
r.Use(TenantGuardWithOrgID(orgID))
|
||||
r.GET("/health", func(c *gin.Context) { c.String(200, "ok") })
|
||||
r.GET("/buildinfo", func(c *gin.Context) { c.String(200, "buildinfo") })
|
||||
r.GET("/metrics", func(c *gin.Context) { c.String(200, "metrics") })
|
||||
r.GET("/workspaces", func(c *gin.Context) { c.String(200, "workspaces") })
|
||||
return r
|
||||
@ -71,10 +73,14 @@ func TestTenantGuard_MissingHeaderIs404(t *testing.T) {
|
||||
}
|
||||
|
||||
// Allowlisted paths bypass the guard even in tenant mode — required for health
|
||||
// probes (Fly Machines checks) and Prometheus scrape.
|
||||
// probes (Fly Machines checks), Prometheus scrape, and the redeploy-fleet
|
||||
// /buildinfo verification step. /buildinfo without an org header used to
|
||||
// 404-via-NoRoute → canvas (HTML), which made the redeploy verifier think
|
||||
// every tenant was stale even when the binary was current. Pin this so a
|
||||
// future allowlist edit can't silently regress that check.
|
||||
func TestTenantGuard_AllowlistBypassesCheck(t *testing.T) {
|
||||
r := newGuardedRouter("org-abc")
|
||||
for _, path := range []string{"/health", "/metrics"} {
|
||||
for _, path := range []string{"/health", "/buildinfo", "/metrics"} {
|
||||
w := doRequest(r, path, "") // no header
|
||||
if w.Code != 200 {
|
||||
t.Errorf("%s: allowlisted path should return 200 without header, got %d", path, w.Code)
|
||||
|
||||
@ -229,19 +229,61 @@ async def send_a2a_message(target_url: str, message: str) -> str:
|
||||
return _format_a2a_error(last_exc, target_url)
|
||||
|
||||
|
||||
async def get_peers() -> list[dict]:
|
||||
"""Get this workspace's peers from the platform registry."""
|
||||
async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]:
|
||||
"""Get this workspace's peers, returning (peers, diagnostic).
|
||||
|
||||
diagnostic is None when the call succeeded (status 200, even if the list
|
||||
is empty). When peers is [] for a non-trivial reason (auth failure,
|
||||
workspace-id missing from registry, platform error, network error),
|
||||
diagnostic is a short human-readable string explaining what went wrong
|
||||
so callers can surface it instead of "may be isolated" — see #2397.
|
||||
|
||||
The legacy get_peers() shim below preserves the bare-list contract for
|
||||
non-tool callers.
|
||||
"""
|
||||
url = f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers"
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
try:
|
||||
resp = await client.get(
|
||||
f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers",
|
||||
url,
|
||||
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
return resp.json()
|
||||
return []
|
||||
except Exception:
|
||||
return []
|
||||
except Exception as e:
|
||||
return [], f"Cannot reach platform at {PLATFORM_URL}: {e}"
|
||||
|
||||
if resp.status_code == 200:
|
||||
try:
|
||||
data = resp.json()
|
||||
except Exception as e:
|
||||
return [], f"Platform returned 200 but body was not JSON: {e}"
|
||||
if not isinstance(data, list):
|
||||
return [], f"Platform returned 200 but body was not a list: {type(data).__name__}"
|
||||
return data, None
|
||||
|
||||
if resp.status_code in (401, 403):
|
||||
return [], (
|
||||
f"Authentication to platform failed (HTTP {resp.status_code}). "
|
||||
"The workspace bearer token may be invalid — restarting the workspace usually re-mints it."
|
||||
)
|
||||
if resp.status_code == 404:
|
||||
return [], (
|
||||
f"Workspace ID {WORKSPACE_ID} is not registered with the platform (HTTP 404). "
|
||||
"Re-registration via the platform's /registry/register endpoint is needed."
|
||||
)
|
||||
if 500 <= resp.status_code < 600:
|
||||
return [], f"Platform error: HTTP {resp.status_code}."
|
||||
return [], f"Unexpected platform response: HTTP {resp.status_code}."
|
||||
|
||||
|
||||
async def get_peers() -> list[dict]:
|
||||
"""Get this workspace's peers from the platform registry.
|
||||
|
||||
Bare-list shim over get_peers_with_diagnostic() — discards the diagnostic
|
||||
so callers that don't care about the failure reason (e.g. system-prompt
|
||||
bootstrap formatters) get the same shape they always had.
|
||||
"""
|
||||
peers, _ = await get_peers_with_diagnostic()
|
||||
return peers
|
||||
|
||||
|
||||
async def get_workspace_info() -> dict:
|
||||
|
||||
@ -18,6 +18,7 @@ from a2a_client import (
|
||||
_peer_names,
|
||||
discover_peer,
|
||||
get_peers,
|
||||
get_peers_with_diagnostic,
|
||||
get_workspace_info,
|
||||
send_a2a_message,
|
||||
)
|
||||
@ -410,9 +411,16 @@ async def tool_send_message_to_user(message: str, attachments: list[str] | None
|
||||
|
||||
async def tool_list_peers() -> str:
|
||||
"""List all workspaces this agent can communicate with."""
|
||||
peers = await get_peers()
|
||||
peers, diagnostic = await get_peers_with_diagnostic()
|
||||
if not peers:
|
||||
return "No peers available (this workspace may be isolated)"
|
||||
if diagnostic is not None:
|
||||
# Non-trivial empty: auth failure / 404 / 5xx / network — surface
|
||||
# the actual reason so the user/agent doesn't have to guess. #2397.
|
||||
return f"No peers found. {diagnostic}"
|
||||
return (
|
||||
"You have no peers in the platform registry. "
|
||||
"(No parent, no children, no siblings registered.)"
|
||||
)
|
||||
lines = []
|
||||
for p in peers:
|
||||
status = p.get("status", "unknown")
|
||||
|
||||
@ -577,6 +577,149 @@ class TestGetPeers:
|
||||
assert headers_sent.get("X-Workspace-ID") == a2a_client.WORKSPACE_ID
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_peers_with_diagnostic — issue #2397
|
||||
#
|
||||
# Pin: an empty peer list MUST come with an actionable diagnostic on every
|
||||
# non-200 + every transport failure. The bug was that get_peers swallowed
|
||||
# every failure mode behind `return []`, leaving the agent's tool wrapper
|
||||
# with no way to distinguish "you have no peers" from "auth broke" / "404
|
||||
# from registry" / "platform 5xx" / "network timeout". Each of these
|
||||
# requires a different operator action.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestGetPeersWithDiagnostic:
|
||||
|
||||
async def test_200_returns_peers_and_no_diagnostic(self):
|
||||
"""200 with valid list → (peers, None). diagnostic stays None on success."""
|
||||
import a2a_client
|
||||
|
||||
peers = [{"id": "ws-1", "name": "Alpha"}]
|
||||
resp = _make_response(200, peers)
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == peers
|
||||
assert diag is None
|
||||
|
||||
async def test_200_empty_list_returns_no_diagnostic(self):
|
||||
"""200 with [] → (peers=[], diag=None). Truly no peers is success, not error."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(200, [])
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is None
|
||||
|
||||
async def test_401_returns_auth_diagnostic(self):
|
||||
"""401 → diagnostic mentions auth + restart hint."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(401, {"detail": "unauthorized"})
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is not None
|
||||
assert "401" in diag
|
||||
assert "Authentication" in diag or "authentication" in diag.lower()
    async def test_403_returns_auth_diagnostic(self):
        """403 → same auth-failure diagnostic shape as 401."""
        import a2a_client

        resp = _make_response(403, {"detail": "forbidden"})
        mock_client = _make_mock_client(get_resp=resp)

        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
            result, diag = await a2a_client.get_peers_with_diagnostic()

        assert result == []
        assert diag is not None
        assert "403" in diag

    async def test_404_returns_registration_diagnostic(self):
        """404 → diagnostic tells operator the workspace ID is missing from the registry."""
        import a2a_client

        resp = _make_response(404, {"detail": "not found"})
        mock_client = _make_mock_client(get_resp=resp)

        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
            result, diag = await a2a_client.get_peers_with_diagnostic()

        assert result == []
        assert diag is not None
        assert "404" in diag
        assert "registered" in diag.lower() or "registration" in diag.lower()

    async def test_500_returns_platform_error_diagnostic(self):
        """5xx → 'Platform error: HTTP <code>.'"""
        import a2a_client

        resp = _make_response(503, {"detail": "service unavailable"})
        mock_client = _make_mock_client(get_resp=resp)

        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
            result, diag = await a2a_client.get_peers_with_diagnostic()

        assert result == []
        assert diag is not None
        assert "503" in diag
        assert "Platform error" in diag or "platform error" in diag.lower()

    async def test_network_exception_returns_unreachable_diagnostic(self):
        """httpx exception → diagnostic mentions PLATFORM_URL + the underlying error."""
        import a2a_client

        mock_client = _make_mock_client(get_exc=TimeoutError("connection timed out"))

        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
            result, diag = await a2a_client.get_peers_with_diagnostic()

        assert result == []
        assert diag is not None
        assert "Cannot reach platform" in diag or "cannot reach" in diag.lower()
        assert "timed out" in diag

    async def test_200_with_non_list_body_returns_diagnostic(self):
        """200 but body is a dict → diagnostic flags shape mismatch (regression guard)."""
        import a2a_client

        resp = _make_response(200, {"oops": "should have been a list"})
        mock_client = _make_mock_client(get_resp=resp)

        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
            result, diag = await a2a_client.get_peers_with_diagnostic()

        assert result == []
        assert diag is not None
        assert "list" in diag.lower()

    async def test_get_peers_shim_preserves_bare_list_contract(self):
        """get_peers() still returns just list[dict] — no API break for non-tool callers."""
        import a2a_client

        peers = [{"id": "ws-1", "name": "Alpha"}]
        resp = _make_response(200, peers)
        mock_client = _make_mock_client(get_resp=resp)

        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
            result = await a2a_client.get_peers()

        # Must be a list, not a tuple — bare-list shim contract.
        assert isinstance(result, list)
        assert result == peers
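(Editorial aside, not part of this diff: the assertions above pin down the error-mapping contract of get_peers_with_diagnostic() without showing its body, which lives in a2a_client outside this excerpt. A minimal sketch that would satisfy these tests; the endpoint path, environment variable names, and exact message wording are assumptions for illustration, not the shipped implementation.)

import os

import httpx

PLATFORM_URL = os.environ.get("PLATFORM_URL", "http://platform.example")   # assumed config
WORKSPACE_ID = os.environ.get("WORKSPACE_ID", "ws-unknown")                # assumed config


async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]:
    """Return (peers, diagnostic); diagnostic is None only on success."""
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(f"{PLATFORM_URL}/api/peers")  # path is an assumption
    except Exception as exc:  # network-level failure, e.g. TimeoutError
        return [], f"Cannot reach platform at {PLATFORM_URL}: {exc}"

    if resp.status_code in (401, 403):
        return [], (f"Authentication to platform failed (HTTP {resp.status_code}). "
                    "Restart the workspace to re-mint the token.")
    if resp.status_code == 404:
        return [], (f"Workspace ID {WORKSPACE_ID} is not registered with the platform "
                    "(HTTP 404). Re-register the workspace.")
    if resp.status_code >= 500:
        return [], f"Platform error: HTTP {resp.status_code}."

    body = resp.json()
    if not isinstance(body, list):
        return [], f"Unexpected response shape: expected a list, got {type(body).__name__}."
    return body, None


async def get_peers() -> list[dict]:
    """Bare-list shim kept so existing callers see no API change."""
    peers, _diag = await get_peers_with_diagnostic()
    return peers

The shim at the end is what the last test above guards: callers that never look at diagnostics keep getting a plain list.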


# ---------------------------------------------------------------------------
# get_workspace_info
# ---------------------------------------------------------------------------
@@ -536,11 +536,54 @@ class TestToolSendMessageToUser:


class TestToolListPeers:

-    async def test_no_peers_returns_isolated_message(self):
+    async def test_true_empty_returns_no_peers_message_without_diagnostic(self):
        """200 + empty list → 'no peers in the platform registry' (no failure)."""
        import a2a_tools
-        with patch("a2a_tools.get_peers", return_value=[]):
+        with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], None)):
            result = await a2a_tools.tool_list_peers()
        assert "No peers available" in result
        # The new wording explicitly says no peers exist (no parent/sibling/child).
        # Avoids the misleading "may be isolated" hint when discovery succeeded.
        assert "no peers" in result.lower()
        assert "No peers found." not in result  # diagnostic prefix should NOT appear on the success branch
        assert "may be isolated" not in result

    async def test_auth_failure_surfaces_restart_hint(self):
        """401/403 → tool_list_peers must surface the auth failure + restart hint, not 'isolated'."""
        import a2a_tools
        diag = "Authentication to platform failed (HTTP 401). Restart the workspace to re-mint."
        with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
            result = await a2a_tools.tool_list_peers()
        assert "401" in result
        assert "Authentication" in result
        # The "isolated" message was the bug — make sure the regression doesn't return.
        assert "may be isolated" not in result

    async def test_404_surfaces_registration_hint(self):
        """404 → tool_list_peers tells the user re-registration is needed."""
        import a2a_tools
        diag = "Workspace ID ws-test is not registered with the platform (HTTP 404). Re-register."
        with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
            result = await a2a_tools.tool_list_peers()
        assert "404" in result
        assert "registered" in result.lower()

    async def test_5xx_surfaces_platform_error(self):
        """5xx → 'Platform error' surfaced; agent / user can correctly route to oncall."""
        import a2a_tools
        diag = "Platform error: HTTP 503."
        with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
            result = await a2a_tools.tool_list_peers()
        assert "503" in result
        assert "Platform error" in result

    async def test_network_error_surfaces_unreachable(self):
        """Network error → operator can tell that the workspace can't reach the platform at all."""
        import a2a_tools
        diag = "Cannot reach platform at http://platform.example: timed out"
        with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
            result = await a2a_tools.tool_list_peers()
        assert "Cannot reach platform" in result
        assert "timed out" in result

    async def test_peers_returned_formatted_lines(self):
        """Peers list is formatted as '- name (ID: ..., status: ..., role: ...)'."""
@@ -550,7 +593,7 @@ class TestToolListPeers:
            {"id": "ws-1", "name": "Alpha", "status": "online", "role": "worker"},
            {"id": "ws-2", "name": "Beta", "status": "idle", "role": "analyst"},
        ]
-        with patch("a2a_tools.get_peers", return_value=peers):
+        with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
            result = await a2a_tools.tool_list_peers()

        assert "Alpha" in result
@@ -567,7 +610,7 @@ class TestToolListPeers:
        # Clear any prior cache entries for these IDs
        a2a_tools._peer_names.pop("ws-cache-test", None)
        peers = [{"id": "ws-cache-test", "name": "CacheMe", "status": "online", "role": "w"}]
-        with patch("a2a_tools.get_peers", return_value=peers):
+        with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
            await a2a_tools.tool_list_peers()

        assert a2a_tools._peer_names.get("ws-cache-test") == "CacheMe"
@@ -577,7 +620,7 @@ class TestToolListPeers:
        import a2a_tools

        peers = [{"id": "ws-3", "name": "Gamma"}]  # no status, no role
-        with patch("a2a_tools.get_peers", return_value=peers):
+        with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
            result = await a2a_tools.tool_list_peers()

        assert "Gamma" in result