name: Harness Replays

# Boots tests/harness (production-shape compose topology with TenantGuard,
# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
# every replay under tests/harness/replays/. Fails the PR if any replay
# fails.
#
# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
# a public route in router.go but forgot to add it to TenantGuard's
# allowlist. The handler-level test in buildinfo_test.go constructed a
# minimal gin engine without TenantGuard — green. The harness's
# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
# inject X-Molecule-Org-Id, so the curl path is identical to production's
# redeploy verifier), but no one ran the harness pre-merge. The bug
# shipped; the redeploy verifier silently soft-warned every tenant as
# "unreachable" for ~1 day before being noticed.
#
# This gate makes "did you actually run the harness?" a CI invariant
# instead of a memory-discipline thing.
#
# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
# to staging+main, real work is gated per-step on detect-changes output.
# One job → one check run → branch-protection-clean (the SKIPPED-in-set
# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).

on:
  # Deliberately NO workflow-level `paths:` filter on push/pull_request.
  # A path-filtered workflow never starts for commits outside the filter,
  # so its check run is never reported and a required check would sit in
  # "pending" forever. Path gating lives in the detect-changes job below,
  # which lets the "No-op pass" step report success for unrelated changes.
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
  workflow_dispatch:
  merge_group:
    types: [checks_requested]

concurrency:
  # Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
  # cancellation deadlock — see e2e-api.yml's concurrency block for
  # the 2026-04-28 incident that codified this pattern.
  group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false

jobs:
  # Decides whether the expensive harness run is needed for this commit.
  # Output `run` is 'true' for manual dispatches and for commits touching
  # any of the filtered paths; otherwise it is the filter's own output.
  detect-changes:
    runs-on: ubuntu-latest
    outputs:
      run: ${{ steps.decide.outputs.run }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
        id: filter
        with:
          filters: |
            run:
              - 'workspace-server/**'
              - 'canvas/**'
              - 'tests/harness/**'
              - '.github/workflows/harness-replays.yml'
      - id: decide
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "run=true" >> "$GITHUB_OUTPUT"
          else
            echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
          fi

  # ONE job that always runs. Real work is gated per-step on
  # detect-changes.outputs.run so an unrelated PR (e.g. doc-only
  # change to molecule-controlplane wired here later) emits the
  # required check without spending CI cycles. Single-job pattern
  # matches e2e-api.yml — see that workflow's comment for why a
  # job-level `if: false` would block branch protection via the
  # SKIPPED-in-set bug.
  harness-replays:
    needs: detect-changes
    name: Harness Replays
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.run != 'true'
        run: |
          echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
          echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."

      - if: needs.detect-changes.outputs.run == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
      # the plugin was dropped + Dockerfile.tenant no longer COPYs it.

      # Pre-clone manifest deps before docker compose builds the tenant
      # image (Task #173 followup — same pattern as
      # publish-workspace-server-image.yml's "Pre-clone manifest deps"
      # step).
      #
      # Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
      # and tenant-beta from workspace-server/Dockerfile.tenant with
      # context=../.. (repo root). That Dockerfile expects
      # .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
      # to be present at build context root (post-#173 it COPYs from there
      # instead of running an in-image clone — the in-image clone failed
      # with "could not read Username for https://git.moleculesai.app"
      # because there's no auth path inside the build sandbox).
      #
      # Without this step harness-replays fails before any replay runs,
      # with `failed to calculate checksum of ref ...
      # "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
      # (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
      # symptom, different root cause: staging still has the in-image
      # clone path, hits the auth error directly).
      #
      # 2026-05-08 sub-finding (#192): the clone step ALSO fails when
      # any referenced workspace-template repo is private and the
      # AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
      # access. Root cause: 5 of 9 workspace-template repos
      # (openclaw, codex, crewai, deepagents, gemini-cli) had been
      # marked private with no team grant. Resolution: flipped them
      # to public per `feedback_oss_first_repo_visibility_default`
      # (the OSS surface should be public). Layer-3 (customer-private +
      # marketplace third-party repos) tracked separately in
      # internal#102.
      #
      # Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
      # is the devops-engineer persona PAT, NOT the founder PAT (per
      # `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
      # embeds it as basic-auth for the duration of the clones and strips
      # .git directories — the token never enters the resulting image.
      - name: Pre-clone manifest deps
        if: needs.detect-changes.outputs.run == 'true'
        env:
          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
        run: |
          set -euo pipefail
          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
            exit 1
          fi
          mkdir -p .tenant-bundle-deps
          bash scripts/clone-manifest.sh \
            manifest.json \
            .tenant-bundle-deps/workspace-configs-templates \
            .tenant-bundle-deps/org-templates \
            .tenant-bundle-deps/plugins
          # Sanity-check the clone output and log per-directory counts.
          # Under `set -euo pipefail` a missing target directory makes
          # `find` (and therefore this step) fail here, before the image
          # build. The counts themselves are informational only: an
          # existing-but-empty directory is reported, not rejected.
          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"

      - name: Install Python deps for replays
        # peer-discovery-404 (and future replays) eval Python against the
        # running tenant — importing workspace/a2a_client.py pulls in
        # httpx. tests/harness/requirements.txt holds just the HTTP-client
        # surface to keep CI install fast (~3s) vs the full
        # workspace/requirements.txt (~30s).
        if: needs.detect-changes.outputs.run == 'true'
        run: pip install -r tests/harness/requirements.txt

      - name: Run all replays against the harness
        # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
        # every replays/*.sh → tear down via down.sh on EXIT (trap).
        # Non-zero exit on any replay failure.
        #
        # KEEP_UP=1: without this, the script's trap-on-EXIT tears
        # down containers immediately on failure, leaving the dump
        # step below with nothing to dump (verified on PR #2410's
        # first run — tenant became unhealthy, trap fired, dump
        # step saw empty containers). Keeping them up lets the
        # failure path collect tenant/cp-stub/cf-proxy logs. The
        # always-run "Force teardown" step does the actual cleanup.
        if: needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        env:
          KEEP_UP: "1"
        run: ./run-all-replays.sh

      - name: Dump compose logs on failure
        # SECRETS_ENCRYPTION_KEY: docker compose validates the entire compose
        # file even for read-only `logs` calls. up.sh generates a per-run key
        # and exports it to its OWN shell — this step runs in a fresh shell
        # that wouldn't see it, so without a placeholder the validate step
        # errors before logs print (verified against PR #2492's first run:
        # "required variable SECRETS_ENCRYPTION_KEY is missing a value").
        # A placeholder is fine — we're only reading log streams, not booting.
        if: failure() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        env:
          SECRETS_ENCRYPTION_KEY: dump-logs-placeholder
        run: |
          echo "=== docker compose ps ==="
          docker compose -f compose.yml ps || true
          echo "=== tenant-alpha logs ==="
          docker compose -f compose.yml logs tenant-alpha || true
          echo "=== tenant-beta logs ==="
          docker compose -f compose.yml logs tenant-beta || true
          echo "=== cp-stub logs ==="
          docker compose -f compose.yml logs cp-stub || true
          echo "=== cf-proxy logs ==="
          docker compose -f compose.yml logs cf-proxy || true
          echo "=== postgres-alpha logs (last 100) ==="
          docker compose -f compose.yml logs --tail 100 postgres-alpha || true
          echo "=== postgres-beta logs (last 100) ==="
          docker compose -f compose.yml logs --tail 100 postgres-beta || true

      - name: Force teardown
        # We pass KEEP_UP=1 to run-all-replays.sh so the dump step
        # above sees real containers — that means we own teardown
        # explicitly here. Always run.
        if: always() && needs.detect-changes.outputs.run == 'true'
        working-directory: tests/harness
        run: ./down.sh || true