From 3105e87cf73bea689987e8f69b17ed2c6db3be2b Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 13:04:53 -0700
Subject: [PATCH 1/5] ci: gate PRs on tests/harness/run-all-replays.sh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the gap between "the harness exists" and "the harness blocks
bugs." Phase 2 of the harness roadmap (per tests/harness/README.md):
make harness-based E2E a required CI check on every PR touching the
tenant binary or the harness itself.

Trigger: push + pull_request to staging+main, paths-filtered to
workspace-server/**, canvas/**, tests/harness/**, and this workflow.
merge_group support included so this becomes branch-protectable.
Single-job-with-conditional-steps pattern (matches e2e-api.yml). One
check run regardless of paths-filter outcome; satisfies branch
protection cleanly per the PR #2264 SKIPPED-in-set finding.

Why this exists: 2026-04-30 we shipped a TenantGuard allowlist gap
(/buildinfo added to router.go in #2398, never added to the allowlist)
that the existing buildinfo-stale-image.sh replay would have caught.
The harness was wired correctly; nobody ran it. Replays as a discipline
beat replays as a memory item.

The CI pipeline:

  detect-changes (paths filter)
  └ harness-replays (always)
    ├ no-op pass when paths-filter says no relevant change
    └ otherwise: checkout + sibling plugin checkout + /etc/hosts entry
      + run-all-replays.sh + compose-logs-on-failure + force-teardown

Compose logs from tenant/cp-stub/cf-proxy/postgres are dumped on
failure so a CI red is debuggable without re-reproducing locally. The
trap in run-all-replays.sh handles teardown; the always-run down.sh
step is a belt-and-suspenders against trap-bypass kills.

Follow-ups (not in this PR):

- Add this check to staging branch protection once it's been green for
  a few PRs (the new-workflow-instability hedge that other gates
  followed).
- Eventually wire the buildx GHA cache to speed up tenant image builds
  — currently every PR rebuilds the full Dockerfile.tenant (Go +
  Next.js + template clones) from scratch. Acceptable for now;
  optimize when the timeout-minutes:30 ceiling becomes painful.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/harness-replays.yml | 148 ++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 .github/workflows/harness-replays.yml

diff --git a/.github/workflows/harness-replays.yml b/.github/workflows/harness-replays.yml
new file mode 100644
index 00000000..caf5fe77
--- /dev/null
+++ b/.github/workflows/harness-replays.yml
@@ -0,0 +1,148 @@
+name: Harness Replays
+
+# Boots tests/harness (production-shape compose topology with TenantGuard,
+# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
+# every replay under tests/harness/replays/. Fails the PR if any replay
+# fails.
+#
+# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
+# a public route in router.go but forgot to add it to TenantGuard's
+# allowlist. The handler-level test in buildinfo_test.go constructed a
+# minimal gin engine without TenantGuard — green. The harness's
+# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
+# inject X-Molecule-Org-Id, so the curl path is identical to production's
+# redeploy verifier), but no one ran the harness pre-merge. The bug
+# shipped; the redeploy verifier silently soft-warned every tenant as
+# "unreachable" for ~1 day before being noticed.
+#
+# This gate makes "did you actually run the harness?" a CI invariant
+# instead of a memory-discipline thing.
+#
+# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
+# to staging+main, real work is gated per-step on detect-changes output.
+# One job → one check run → branch-protection-clean (the SKIPPED-in-set
+# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).
+
+on:
+  push:
+    branches: [main, staging]
+    paths:
+      - 'workspace-server/**'
+      - 'canvas/**'
+      - 'tests/harness/**'
+      - '.github/workflows/harness-replays.yml'
+  pull_request:
+    branches: [main, staging]
+    paths:
+      - 'workspace-server/**'
+      - 'canvas/**'
+      - 'tests/harness/**'
+      - '.github/workflows/harness-replays.yml'
+  workflow_dispatch:
+  merge_group:
+    types: [checks_requested]
+
+concurrency:
+  # Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
+  # cancellation deadlock — see e2e-api.yml's concurrency block for
+  # the 2026-04-28 incident that codified this pattern.
+  group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: false
+
+jobs:
+  detect-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      run: ${{ steps.decide.outputs.run }}
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
+        id: filter
+        with:
+          filters: |
+            run:
+              - 'workspace-server/**'
+              - 'canvas/**'
+              - 'tests/harness/**'
+              - '.github/workflows/harness-replays.yml'
+      - id: decide
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "run=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
+          fi
+
+  # ONE job that always runs. Real work is gated per-step on
+  # detect-changes.outputs.run so an unrelated PR (e.g. doc-only
+  # change to molecule-controlplane wired here later) emits the
+  # required check without spending CI cycles. Single-job pattern
+  # matches e2e-api.yml — see that workflow's comment for why a
+  # job-level `if: false` would block branch protection via the
+  # SKIPPED-in-set bug.
+  harness-replays:
+    needs: detect-changes
+    name: Harness Replays
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: No-op pass (paths filter excluded this commit)
+        if: needs.detect-changes.outputs.run != 'true'
+        run: |
+          echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
+          echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
+
+      - if: needs.detect-changes.outputs.run == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Checkout sibling plugin repo
+        # Dockerfile.tenant copies molecule-ai-plugin-github-app-auth/
+        # at the build-context root (see workspace-server/Dockerfile.tenant
+        # line 19). PLUGIN_REPO_PAT pattern matches publish-workspace-server-image.yml.
+        if: needs.detect-changes.outputs.run == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+        with:
+          repository: Molecule-AI/molecule-ai-plugin-github-app-auth
+          path: molecule-ai-plugin-github-app-auth
+          token: ${{ secrets.PLUGIN_REPO_PAT || secrets.GITHUB_TOKEN }}
+
+      - name: Add /etc/hosts entry for harness-tenant.localhost
+        # ubuntu-latest doesn't auto-resolve *.localhost the way macOS
+        # sometimes does. seed.sh + replay scripts curl
+        # http://harness-tenant.localhost:8080 — without the entry
+        # they'd fail with getaddrinfo ENOTFOUND.
+        if: needs.detect-changes.outputs.run == 'true'
+        run: |
+          echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts >/dev/null
+          getent hosts harness-tenant.localhost
+
+      - name: Run all replays against the harness
+        # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
+        # every replays/*.sh → tear down via down.sh on EXIT (trap).
+        # Non-zero exit on any replay failure.
+        if: needs.detect-changes.outputs.run == 'true'
+        working-directory: tests/harness
+        run: ./run-all-replays.sh
+
+      - name: Dump compose logs on failure
+        if: failure() && needs.detect-changes.outputs.run == 'true'
+        working-directory: tests/harness
+        run: |
+          echo "=== docker compose ps ==="
+          docker compose -f compose.yml ps || true
+          echo "=== tenant logs ==="
+          docker compose -f compose.yml logs tenant || true
+          echo "=== cp-stub logs ==="
+          docker compose -f compose.yml logs cp-stub || true
+          echo "=== cf-proxy logs ==="
+          docker compose -f compose.yml logs cf-proxy || true
+          echo "=== postgres logs (last 100) ==="
+          docker compose -f compose.yml logs --tail 100 postgres || true
+
+      - name: Force teardown (belt-and-suspenders)
+        # run-all-replays.sh's trap should already have torn down,
+        # but if something killed bash before the trap fired, this
+        # ensures the runner doesn't leak the network/volumes.
+        if: always() && needs.detect-changes.outputs.run == 'true'
+        working-directory: tests/harness
+        run: ./down.sh || true

From 24cb2a286f531e880c94e9edb71892352e817e50 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 13:15:46 -0700
Subject: [PATCH 2/5] ci(harness-replays): KEEP_UP=1 so dump-logs step has
 containers to read

First run on PR #2410 failed with 'container harness-tenant-1 is
unhealthy' but the dump-compose-logs step printed empty tenant logs
because run-all-replays.sh's trap-on-EXIT had already torn down the
harness. Setting KEEP_UP=1 leaves containers in place; the always-run
Force teardown step at the end owns cleanup explicitly. Now we'll
actually see why the tenant didn't become healthy.
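The KEEP_UP contract this commit leans on can be sketched in isolation. This is a hypothetical shape, not the real run-all-replays.sh: the teardown function, echo strings, and the direct calls at the bottom (standing in for `trap teardown EXIT`) are illustrative only.

```shell
#!/usr/bin/env sh
# Sketch of a KEEP_UP-gated teardown trap (hypothetical; the real
# script wires this as `trap teardown EXIT` and calls ./down.sh).
set -eu

teardown() {
  if [ "${KEEP_UP:-0}" = "1" ]; then
    # CI sets KEEP_UP=1: leave containers up so a later dump-logs
    # step has something to read; an always-run workflow step owns
    # the real cleanup.
    echo "KEEP_UP=1: skipping down.sh"
  else
    echo "running down.sh"  # stand-in for ./down.sh
  fi
}

# Demonstrate both modes without touching docker:
KEEP_UP=0 teardown
KEEP_UP=1 teardown
```

Run directly, this prints the default-teardown line first and the skip line second, mirroring the two CI paths.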
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/harness-replays.yml | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/harness-replays.yml b/.github/workflows/harness-replays.yml
index caf5fe77..384b8567 100644
--- a/.github/workflows/harness-replays.yml
+++ b/.github/workflows/harness-replays.yml
@@ -120,8 +120,18 @@ jobs:
         # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
         # every replays/*.sh → tear down via down.sh on EXIT (trap).
         # Non-zero exit on any replay failure.
+        #
+        # KEEP_UP=1: without this, the script's trap-on-EXIT tears
+        # down containers immediately on failure, leaving the dump
+        # step below with nothing to dump (verified on PR #2410's
+        # first run — tenant became unhealthy, trap fired, dump
+        # step saw empty containers). Keeping them up lets the
+        # failure path collect tenant/cp-stub/cf-proxy logs. The
+        # always-run "Force teardown" step does the actual cleanup.
         if: needs.detect-changes.outputs.run == 'true'
         working-directory: tests/harness
+        env:
+          KEEP_UP: "1"
         run: ./run-all-replays.sh
 
       - name: Dump compose logs on failure
@@ -139,10 +149,10 @@ jobs:
           echo "=== postgres logs (last 100) ==="
           docker compose -f compose.yml logs --tail 100 postgres || true
 
-      - name: Force teardown (belt-and-suspenders)
-        # run-all-replays.sh's trap should already have torn down,
-        # but if something killed bash before the trap fired, this
-        # ensures the runner doesn't leak the network/volumes.
+      - name: Force teardown
+        # We pass KEEP_UP=1 to run-all-replays.sh so the dump step
+        # above sees real containers — that means we own teardown
+        # explicitly here. Always run.
         if: always() && needs.detect-changes.outputs.run == 'true'
         working-directory: tests/harness
         run: ./down.sh || true

From 630dd0dae7084586ca037ef54cc15a9037562035 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 13:25:52 -0700
Subject: [PATCH 3/5] fix(harness): seed SECRETS_ENCRYPTION_KEY so
 MOLECULE_ENV=production tenant boots
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Found via the first run of the harness-replays-required-check workflow
(#2410): the tenant container failed its healthcheck after 100s with
"refusing to boot without encryption in production". This is the
deferred CRITICAL flagged on PR #2401 — `crypto.InitStrict()` requires
SECRETS_ENCRYPTION_KEY when MOLECULE_ENV=production, and the harness
sets prod-mode but never seeded a key.

Fix: add a clearly-test 32-byte base64 value (encoding the literal
string "harness-test-only-not-for-prod!!") inline. Keeping
MOLECULE_ENV=production preserves the harness's value as a
production-shape replay surface — it now exercises the full encryption
boot path including the strict check, rather than skirting it via
dev-mode.

Why inline rather than .env:

- The harness compose file is meant to be self-contained and
  reproducible from a clean clone. An external .env would split the
  config across two files for one synthetic value.
- The value is intentionally a sentinel; there's no operator decision
  here to gate behind a per-deployment file.

After this lands the harness boots clean and `run-all-replays.sh` can
exercise the buildinfo + peer-discovery replays as designed. The
required-check workflow itself (#2410) needs no change.
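The "exactly 32 bytes" claim is easy to verify locally. A quick sketch, assuming a POSIX shell with a `base64` that supports `-d` (GNU coreutils does):

```shell
#!/usr/bin/env sh
# Decode the sentinel from the diff and confirm it is exactly the
# 32 bytes crypto.InitStrict() demands, and that it spells the
# clearly-test string the compose comment claims.
set -eu
key="aGFybmVzcy10ZXN0LW9ubHktbm90LWZvci1wcm9kISE="
decoded="$(printf '%s' "$key" | base64 -d)"
printf '%s\n' "$decoded"         # prints: harness-test-only-not-for-prod!!
printf '%s' "$decoded" | wc -c   # prints: 32
```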
---
 tests/harness/compose.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/harness/compose.yml b/tests/harness/compose.yml
index 867f67bd..3fe185cc 100644
--- a/tests/harness/compose.yml
+++ b/tests/harness/compose.yml
@@ -85,6 +85,14 @@ services:
       PORT: "8080"
       PLATFORM_URL: "http://tenant:8080"
       MOLECULE_ENV: "production"
+      # SECRETS_ENCRYPTION_KEY is required when MOLECULE_ENV=production —
+      # crypto.InitStrict() refuses to boot without it ("32 bytes raw or
+      # base64-encoded"). The harness uses a clearly-test sentinel so the
+      # production code path is exercised end-to-end (including the
+      # encrypted-secret reads/writes) without coupling to a real key.
+      # Value is base64 of the literal string "harness-test-only-not-for-prod!!"
+      # (exactly 32 bytes). Do NOT copy this to any other environment.
+      SECRETS_ENCRYPTION_KEY: "aGFybmVzcy10ZXN0LW9ubHktbm90LWZvci1wcm9kISE="
       # ADMIN_TOKEN flips the platform into strict-auth mode (matches
       # production's CP-minted token configuration). Seeded value lets
       # E2E scripts authenticate without going through CP.

From 9dae0503ee9cd32f3816e7faf6796dc27f42e4a7 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 13:30:14 -0700
Subject: [PATCH 4/5] fix(harness): generate SECRETS_ENCRYPTION_KEY per-run
 instead of hardcoding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the hardcoded base64 sentinel (630dd0da) with a per-run
generation in up.sh, exported into compose's interpolation environment.

Why:

- Hardcoding a 32-byte base64 string in the repo, even one labelled
  "test-only", sets a bad muscle-memory pattern. The next agent or
  contributor copies the shape into another harness — or worse, into a
  staging .env — and the test-only sentinel turns into something
  someone treats as a real key.
- Secret scanners flag key-shaped values regardless of the surrounding
  comment claiming intent. Avoiding the literal entirely sidesteps the
  false-positive.
- A fresh key per harness lifetime more closely mimics prod's
  per-tenant isolation, exercising the same code paths without any
  pretense of stable encrypted-data fixtures (which the harness wipes
  on every ./down.sh anyway).

Implementation:

- up.sh: `openssl rand -base64 32` if SECRETS_ENCRYPTION_KEY isn't
  already set in the caller's env. Honoring a pre-set value lets a
  debug session pin a key for reproducibility (e.g. when investigating
  encrypted-row corruption).
- compose.yml: `${SECRETS_ENCRYPTION_KEY:?…}` makes a misuse loud —
  running `docker compose up` directly bypassing up.sh fails fast with
  a clear error pointing at the right entry point, rather than a 100s
  unhealthy-tenant timeout.

Both paths verified via `docker compose config`:

- with key exported: value interpolates cleanly
- without it: "required variable SECRETS_ENCRYPTION_KEY is missing a
  value: must be set — run via tests/harness/up.sh, which generates
  one per run"
---
 tests/harness/compose.yml | 14 +++++++-------
 tests/harness/up.sh       | 16 ++++++++++++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/tests/harness/compose.yml b/tests/harness/compose.yml
index 3fe185cc..1a382a6a 100644
--- a/tests/harness/compose.yml
+++ b/tests/harness/compose.yml
@@ -86,13 +86,13 @@ services:
       PLATFORM_URL: "http://tenant:8080"
       MOLECULE_ENV: "production"
       # SECRETS_ENCRYPTION_KEY is required when MOLECULE_ENV=production —
-      # crypto.InitStrict() refuses to boot without it ("32 bytes raw or
-      # base64-encoded"). The harness uses a clearly-test sentinel so the
-      # production code path is exercised end-to-end (including the
-      # encrypted-secret reads/writes) without coupling to a real key.
-      # Value is base64 of the literal string "harness-test-only-not-for-prod!!"
-      # (exactly 32 bytes). Do NOT copy this to any other environment.
-      SECRETS_ENCRYPTION_KEY: "aGFybmVzcy10ZXN0LW9ubHktbm90LWZvci1wcm9kISE="
+      # crypto.InitStrict() refuses to boot without it. up.sh generates a
+      # fresh 32-byte key per harness lifetime via `openssl rand -base64 32`
+      # and exports it into this compose file's interpolation environment.
+      # The :? sentinel makes the misuse loud — running `docker compose up`
+      # directly without going through up.sh fails fast with a clear error
+      # rather than getting a confusing tenant-unhealthy timeout.
+      SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}"
       # ADMIN_TOKEN flips the platform into strict-auth mode (matches
       # production's CP-minted token configuration). Seeded value lets
       # E2E scripts authenticate without going through CP.

diff --git a/tests/harness/up.sh b/tests/harness/up.sh
index b3c87936..fbc14910 100755
--- a/tests/harness/up.sh
+++ b/tests/harness/up.sh
@@ -18,6 +18,22 @@ for arg in "$@"; do
   esac
 done
 
+# Generate a per-run encryption key. The tenant runs with
+# MOLECULE_ENV=production (intentional, to replay prod-shape bugs), and
+# crypto.InitStrict() refuses to boot without SECRETS_ENCRYPTION_KEY.
+# Generate fresh so:
+#   - No key-shaped string lives in the repo (avoids muscle-memorying a
+#     hardcoded value into other places + secret-scanner false positives).
+#   - Each harness lifetime gets a unique key, mimicking prod's per-tenant
+#     isolation. Persistence across runs isn't required — the harness DB
+#     is wiped on every ./down.sh.
+# Honor a caller-supplied value if already exported (lets a debug session
+# pin a key for reproducibility).
+if [ -z "${SECRETS_ENCRYPTION_KEY:-}" ]; then
+  SECRETS_ENCRYPTION_KEY=$(openssl rand -base64 32)
+  export SECRETS_ENCRYPTION_KEY
+fi
+
 if [ "$REBUILD" = true ]; then
   docker compose -f compose.yml build --no-cache tenant cp-stub
 fi

From c8b17ea1ad582929fc525bb7cc19e4ca629a60d0 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 13:32:00 -0700
Subject: [PATCH 5/5] fix(harness): install httpx for replay Python evals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

peer-discovery-404 imports workspace/a2a_client.py which depends on
httpx; the runner's stock Python doesn't have it, so the replay's
PARSE assertion (b) fails with ModuleNotFoundError on every run. The
WIRE assertion (a) — pure curl — passes, so the failure was masking
just enough to make the replay LOOK partially-broken when the tenant
side is fine.

Adding tests/harness/requirements.txt with only httpx instead of
sourcing workspace/requirements.txt: that file pulls a2a-sdk,
langchain-core, opentelemetry, sqlalchemy, temporalio, etc. — ~30s of
install for one replay's PARSE step. The harness's deps surface should
grow when a new replay introduces a new import, not by default.

Workflow gains one step (`pip install -r tests/harness/requirements.txt`)
between the /etc/hosts setup and run-all-replays. No other changes.
---
 .github/workflows/harness-replays.yml |  9 +++++++++
 tests/harness/requirements.txt        | 14 ++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 tests/harness/requirements.txt

diff --git a/.github/workflows/harness-replays.yml b/.github/workflows/harness-replays.yml
index 384b8567..6330e885 100644
--- a/.github/workflows/harness-replays.yml
+++ b/.github/workflows/harness-replays.yml
@@ -116,6 +116,15 @@ jobs:
           echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts >/dev/null
           getent hosts harness-tenant.localhost
 
+      - name: Install Python deps for replays
+        # peer-discovery-404 (and future replays) eval Python against the
+        # running tenant — importing workspace/a2a_client.py pulls in
+        # httpx. tests/harness/requirements.txt holds just the HTTP-client
+        # surface to keep CI install fast (~3s) vs the full
+        # workspace/requirements.txt (~30s).
+        if: needs.detect-changes.outputs.run == 'true'
+        run: pip install -r tests/harness/requirements.txt
+
       - name: Run all replays against the harness
         # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
         # every replays/*.sh → tear down via down.sh on EXIT (trap).

diff --git a/tests/harness/requirements.txt b/tests/harness/requirements.txt
new file mode 100644
index 00000000..75a30722
--- /dev/null
+++ b/tests/harness/requirements.txt
@@ -0,0 +1,14 @@
+# Harness-replay Python deps — minimal set for replays/*.sh scripts that
+# eval Python against the running tenant (e.g. importing
+# workspace/a2a_client.py to assert parser behavior).
+#
+# This is intentionally smaller than workspace/requirements.txt: the
+# replays don't need a2a-sdk, langchain, opentelemetry, etc. — only the
+# HTTP client surface that the imported helpers depend on. Adding the
+# full workspace deps would slow every harness CI run by ~30s for no
+# gain.
+#
+# Add a line here (with a version constraint matching workspace/requirements.txt)
+# when a new replay introduces a new Python import.
+
+httpx>=0.28.1
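After patch 4, the key contract is: up.sh generates, compose's `${VAR:?}` interpolation enforces. That contract can be exercised standalone, without docker, as below. This is a hedged sketch (assumes `openssl` and a GNU-style `base64 -d` on PATH); the error message is shortened from the one in the diff.

```shell
#!/usr/bin/env sh
# Standalone sketch of the up.sh / compose.yml key contract.
set -eu

# up.sh side: generate a fresh base64-encoded 32-byte key unless the
# caller pinned one for reproducibility.
if [ -z "${SECRETS_ENCRYPTION_KEY:-}" ]; then
  SECRETS_ENCRYPTION_KEY="$(openssl rand -base64 32)"
  export SECRETS_ENCRYPTION_KEY
fi

# compose.yml side: ${VAR:?message} fails the expansion when the
# variable is unset. Probe that in a subshell so the failure is
# observable here instead of killing this script.
if (unset SECRETS_ENCRYPTION_KEY; : "${SECRETS_ENCRYPTION_KEY:?must be set: run via tests/harness/up.sh}") 2>/dev/null; then
  echo "BUG: expansion should have failed while unset"
else
  echo "unset key fails fast, as docker compose would"
fi

# The generated key decodes to exactly 32 random bytes.
printf '%s' "$SECRETS_ENCRYPTION_KEY" | base64 -d | wc -c   # prints: 32
```

The subshell probe mirrors what `docker compose config` reports when up.sh is bypassed: a loud required-variable error instead of a slow unhealthy-tenant timeout.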