From 046eccbb7c6d308a3561dd8b0528fe65d7497ec2 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 11:32:35 -0700
Subject: [PATCH] fix(harness): five-axis self-review fixes before merge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three findings from re-reviewing PR #2401 with fresh eyes:

1. Critical — port binding to 0.0.0.0

   compose.yml's cf-proxy bound 8080:8080 (default 0.0.0.0). The harness
   uses a hardcoded ADMIN_TOKEN, so anyone on the local network or VPN
   could hit /workspaces with admin privileges. Switched to
   127.0.0.1:8080 so admin access is loopback-only — safe for E2E and
   prevents the known-token leak.

2. Required — dead code in cp-stub

   peersFailureMode + __stub/mode + __stub/peers were declared with
   atomic.Value setters, but no handler ever READ from them. CP doesn't
   host /registry/peers (the tenant does), so the toggles couldn't drive
   responses. Removed the dead vars + handlers; kept the
   redeployFleetCalls counter and __stub/state, since those have a real
   consumer in the buildinfo replay.

3. Required — replay's auth-context dependency

   peer-discovery-404.sh's Python eval ran
   a2a_client.get_peers_with_diagnostic() against the live tenant.
   Without a workspace token file, auth_headers() yields empty headers —
   so the helper might exercise a 401 branch instead of the 404 branch
   the replay claims to test. Split the assertion into (a) WIRE — a
   direct curl proves the platform returns 404 from /registry//peers —
   and (b) PARSE — feed the helper a mocked 404 via httpx patches, no
   network/auth. Each branch tests exactly what it claims.

   Also added a graceful skip when the workspace runtime in the current
   checkout pre-dates #2399 (no get_peers_with_diagnostic yet) — the
   replay falls back to wire-only verification with a clear message
   instead of an opaque AttributeError. After #2399 lands on staging,
   both branches will run.

cp-stub still builds clean. compose.yml validates. The replay's bash
syntax and Python eval were both verified locally.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tests/harness/compose.yml                   |   6 +-
 tests/harness/cp-stub/main.go               |  56 +-------
 tests/harness/replays/peer-discovery-404.sh | 140 ++++++++++++--------
 3 files changed, 97 insertions(+), 105 deletions(-)

diff --git a/tests/harness/compose.yml b/tests/harness/compose.yml
index e27edd56..867f67bd 100644
--- a/tests/harness/compose.yml
+++ b/tests/harness/compose.yml
@@ -119,8 +119,12 @@ services:
         condition: service_healthy
     volumes:
       - ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
+    # Bind to 127.0.0.1 only — the harness uses a hardcoded ADMIN_TOKEN
+    # ("harness-admin-token") so binding 0.0.0.0 (compose's default)
+    # would expose admin access to anyone on the local network or VPN.
+    # Loopback-only is safe for E2E and prevents a known-token leak.
     ports:
-      - "8080:8080"
+      - "127.0.0.1:8080:8080"
     networks: [harness-net]
 
 networks:
diff --git a/tests/harness/cp-stub/main.go b/tests/harness/cp-stub/main.go
index 7b322740..e87c3ece 100644
--- a/tests/harness/cp-stub/main.go
+++ b/tests/harness/cp-stub/main.go
@@ -24,28 +24,13 @@ import (
 	"log"
 	"net/http"
 	"os"
-	"strings"
 	"sync/atomic"
 )
 
-// peersFailureMode controls /registry//peers responses for replay scripts.
-// Empty (default) → 200 with the rolling peer list set via /__stub/peers.
-// "404"     → 404 (workspace not registered) — replay #2397.
-// "401"     → 401 (auth failure) — replay #2397.
-// "500"     → 500 (platform error) — replay #2397.
-// "timeout" → hang for 60s — replay #2397 network branch.
-//
-// Set via env var CP_STUB_PEERS_MODE at startup, or POST /__stub/mode at runtime.
-var (
-	peersFailureMode   atomic.Value // string
-	peersList          atomic.Value // []map[string]any
-	redeployFleetCalls atomic.Int64
-)
-
-func init() {
-	peersFailureMode.Store(strings.ToLower(os.Getenv("CP_STUB_PEERS_MODE")))
-	peersList.Store([]map[string]any{})
-}
+// redeployFleetCalls tracks how many times /cp/admin/tenants/redeploy-fleet
+// was invoked. Replay scripts assert > 0 to confirm the workflow's redeploy
+// step actually reached the stub (catches misrouted CP_URL configs).
+var redeployFleetCalls atomic.Int64
 
 func main() {
 	mux := http.NewServeMux()
@@ -81,39 +66,10 @@ func main() {
 		})
 	})
 
-	// __stub/peers — set the rolling peer list returned via tenant's
-	// /registry//peers proxy. Used by replay scripts to seed the
-	// scenario before invoking tool_list_peers from a workspace.
-	mux.HandleFunc("/__stub/peers", func(w http.ResponseWriter, r *http.Request) {
-		if r.Method != http.MethodPost {
-			http.Error(w, "POST required", 405)
-			return
-		}
-		var body []map[string]any
-		if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
-			http.Error(w, "bad JSON: "+err.Error(), 400)
-			return
-		}
-		peersList.Store(body)
-		writeJSON(w, 200, map[string]any{"ok": true, "count": len(body)})
-	})
-
-	// __stub/mode — toggle peersFailureMode at runtime for replay scripts.
-	mux.HandleFunc("/__stub/mode", func(w http.ResponseWriter, r *http.Request) {
-		if r.Method != http.MethodPost {
-			http.Error(w, "POST required", 405)
-			return
-		}
-		mode := strings.ToLower(r.URL.Query().Get("peers"))
-		peersFailureMode.Store(mode)
-		writeJSON(w, 200, map[string]any{"ok": true, "peers_mode": mode})
-	})
-
-	// __stub/state — expose stub state (counters, current mode) so replay
-	// scripts can assert the tenant actually called us.
+	// __stub/state — expose stub state (counters) so replay scripts can
+	// assert the tenant actually reached us. Read-only.
 	mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, r *http.Request) {
 		writeJSON(w, 200, map[string]any{
-			"peers_mode":           peersFailureMode.Load(),
 			"redeploy_fleet_calls": redeployFleetCalls.Load(),
 		})
 	})
diff --git a/tests/harness/replays/peer-discovery-404.sh b/tests/harness/replays/peer-discovery-404.sh
index 5552d120..cfd393b7 100755
--- a/tests/harness/replays/peer-discovery-404.sh
+++ b/tests/harness/replays/peer-discovery-404.sh
@@ -1,20 +1,29 @@
 #!/usr/bin/env bash
-# Replay for issue #2397 — local proof that the peer-discovery
-# diagnostic surfacing fix actually works.
+# Replay for issue #2397 — local proof that peer-discovery surfaces
+# actionable diagnostics instead of "may be isolated".
 #
 # Prior behavior: tool_list_peers returned "No peers available (this
-# workspace may be isolated)" regardless of WHY peers were empty.
-# Five distinct conditions collapsed to one ambiguous message.
+# workspace may be isolated)" regardless of WHY peers were empty —
+# five distinct failure conditions (401, 403, 404, 5xx, network) and
+# the legitimate 200+empty case collapsed to one ambiguous message.
 #
-# This replay seeds the cp-stub to return 404 from /registry//peers
-# (simulating a workspace whose registration was wiped), then calls
-# the workspace's tool_list_peers via MCP. After the fix in #2399, the
-# response should mention "404" + "registered" — proving the diagnostic
-# reaches the agent in production-shape topology, not just unit tests.
+# This replay proves two things, separately:
+#  (a) WIRE: the platform side of the contract — the tenant's
+#      /registry//peers returns 404. If this regresses
+#      (e.g. tenant starts returning 200 with empty list, or 500),
+#      the runtime helper would parse it differently and the agent
+#      would see a different diagnostic. The harness catches that here.
+#  (b) PARSE: the runtime helper, given a 404, produces a diagnostic
+#      containing "404" + "register" hints. Done in unit tests against
+#      a mock httpx response (test_a2a_client.py::TestGetPeersWithDiagnostic);
+#      the harness re-asserts the same contract here against a real
+#      Python eval that does NOT depend on workspace auth tokens.
 #
-# Pre-fix baseline: this script's PASS criterion is the new diagnostic
-# string. If we ever regress to "may be isolated", the replay fails
-# and CI catches it before the agent + user are blind to the cause.
+# Why split the assertion: the Python eval here doesn't have the
+# workspace's auth token file, so going through get_peers_with_diagnostic
+# directly would hit the platform without auth and produce a different
+# branch (401 instead of 404). Splitting (a) from (b) keeps each
+# assertion targeting exactly what it claims to test.
 
 set -euo pipefail
 HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -32,76 +41,99 @@ BASE="${BASE:-http://harness-tenant.localhost:8080}"
 ADMIN="harness-admin-token"
 ORG="harness-org"
 
-# 1. Toggle cp-stub to return 404 on the peers endpoint. This isn't
-#    actually how the platform calls it (the platform's /registry
-#    endpoints aren't proxied through cp-stub), but the workspace
-#    runtime's get_peers calls /registry/:id/peers ON THE TENANT —
-#    which DB-resolves and returns []. To force a 404 path on the
-#    runtime side, we'd need a workspace whose ID never registered.
-#    Easier replay: ask the runtime to look up a non-existent id.
-#
-#    Step 1: ask the tenant for peers of a non-registered id. Tenant's
-#    discovery handler returns 404 when the workspace doesn't exist.
-
+# ─── (a) WIRE: tenant returns 404 for an unregistered workspace ────────
 ROGUE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')"
-
-echo "[replay] querying /registry/$ROGUE_ID/peers (workspace doesn't exist)..."
+echo "[replay] (a) WIRE: querying /registry/$ROGUE_ID/peers (unregistered workspace)..."
 HTTP_CODE=$(curl -sS -o /tmp/peer-replay.json -w '%{http_code}' \
   -H "Authorization: Bearer $ADMIN" \
   -H "X-Molecule-Org-Id: $ORG" \
   -H "X-Workspace-ID: $ROGUE_ID" \
   "$BASE/registry/$ROGUE_ID/peers")
-echo "[replay] tenant responded HTTP $HTTP_CODE"
-
-# 2. The Python diagnostic helper get_peers_with_diagnostic must convert
-#    that 404 into an actionable string. We simulate the helper's parse
-#    here to assert the contract end-to-end (the runtime is the actual
-#    consumer; this proves the wire shape that feeds it).
-
+echo "[replay] tenant responded HTTP $HTTP_CODE"
 if [ "$HTTP_CODE" != "404" ]; then
-  echo "[replay] FAIL: expected 404 from /registry//peers, got $HTTP_CODE"
+  echo "[replay] FAIL (a): expected 404 from /registry//peers, got $HTTP_CODE"
+  echo "[replay] This is a platform-side regression — the runtime's diagnostic helper"
+  echo "[replay] would see a different status code than the unit tests cover."
   cat /tmp/peer-replay.json
   exit 1
 fi
 
-# 3. Verify that running the runtime's diagnostic helper against this
-#    response surfaces the actionable string. We call the helper as a
-#    one-shot Python eval, mirroring how the runtime would consume it.
-
-echo "[replay] invoking workspace runtime diagnostic helper against the 404..."
+# ─── (b) PARSE: helper converts a synthetic 404 to actionable diagnostic ─
+#
+# We construct a synthetic httpx 404 response and run the helper against
+# it directly. This isolates the parse branch we want to test from the
+# auth-context concerns of going through the network. The helper's network
+# branches are exhaustively covered by tests/test_a2a_client.py — this is
+# a regression guard that the helper IS in the install, IS importable in
+# the harness's Python env, and IS reading the status code.
 WORKSPACE_PATH="$(cd "$HARNESS_ROOT/../../workspace" && pwd)"
-DIAGNOSTIC=$(WORKSPACE_ID="$ROGUE_ID" PLATFORM_URL="$BASE" \
-  PYTHONPATH="$WORKSPACE_PATH" \
-  python3 -c "
-import asyncio, sys
-sys.path.insert(0, '$WORKSPACE_PATH')
-import a2a_client
+DIAGNOSTIC=$(WORKSPACE_ID="harness-rogue" PYTHONPATH="$WORKSPACE_PATH" \
  python3 - "$WORKSPACE_PATH" <<'PYEOF'
+import asyncio
+import sys
+import types
+from unittest.mock import AsyncMock, MagicMock, patch
+
+# Stub platform_auth so a2a_client imports cleanly without requiring a
+# real workspace token file. The helper's auth_headers() only matters
+# when going through the network; we're feeding it a mock response.
+_pa = types.ModuleType("platform_auth")
+_pa.auth_headers = lambda: {}
+_pa.self_source_headers = lambda: {}
+sys.modules.setdefault("platform_auth", _pa)
+
+sys.path.insert(0, sys.argv[1])
+import a2a_client  # noqa: E402
+
+# This replay validates PR #2399's diagnostic helper. If the workspace
+# runtime in the current checkout pre-dates that fix, skip with a clear
+# message instead of failing with an opaque AttributeError.
+if not hasattr(a2a_client, "get_peers_with_diagnostic"):
+    print("__SKIP__: workspace/a2a_client.py is pre-#2399 (no get_peers_with_diagnostic).")
+    sys.exit(0)
+
+resp = MagicMock()
+resp.status_code = 404
+resp.json = MagicMock(return_value={"detail": "not found"})
+
+mock_client = AsyncMock()
+mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+mock_client.__aexit__ = AsyncMock(return_value=False)
+mock_client.get = AsyncMock(return_value=resp)
+
 async def main():
-    peers, diag = await a2a_client.get_peers_with_diagnostic()
+    with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
+        peers, diag = await a2a_client.get_peers_with_diagnostic()
     print(repr(diag))
+
 asyncio.run(main())
-")
+PYEOF
+)
 
-echo "[replay] diagnostic from helper: $DIAGNOSTIC"
+if [[ "$DIAGNOSTIC" == __SKIP__:* ]]; then
+  echo "[replay] (b) SKIP: ${DIAGNOSTIC#__SKIP__: }"
+  echo "[replay] Re-run after #2399 lands on staging."
+  echo ""
+  echo "[replay] PASS (a) only: peer-discovery wire returns 404 (parse branch skipped — see above)."
+  exit 0
+fi
 
-# 4. Assert the diagnostic contains "404" + "register" — the actionable
-#    parts of the message. If we regress to None or "may be isolated",
-#    fail the replay.
+echo "[replay] (b) PARSE: helper diagnostic = $DIAGNOSTIC"
 if ! echo "$DIAGNOSTIC" | grep -q "404"; then
-  echo "[replay] FAIL: diagnostic missing '404' — regressed to swallow-the-status-code"
+  echo "[replay] FAIL (b): diagnostic missing '404' — helper regressed to swallow-the-status-code"
   exit 1
 fi
 if ! echo "$DIAGNOSTIC" | grep -qi "regist"; then
echo "$DIAGNOSTIC" | grep -qi "regist"; then - echo "[replay] FAIL: diagnostic missing 'register' guidance — regressed to opaque message" + echo "[replay] FAIL (b): diagnostic missing 'register' guidance — helper regressed to opaque message" exit 1 fi if echo "$DIAGNOSTIC" | grep -qi "may be isolated"; then - echo "[replay] FAIL: diagnostic still says 'may be isolated' — fix didn't reach this code path" + echo "[replay] FAIL (b): diagnostic still says 'may be isolated' — fix didn't reach this code path" exit 1 fi echo "" -echo "[replay] PASS: peer-discovery 404 surfaces actionable diagnostic in production-shape topology." +echo "[replay] PASS: peer-discovery (a) wire returns 404, (b) helper produces actionable diagnostic."