forked from molecule-ai/molecule-core
fix(harness): five-axis self-review fixes before merge
Three findings from re-reviewing PR #2401 with fresh eyes: 1. Critical — port binding to 0.0.0.0 compose.yml's cf-proxy bound 8080:8080 (default 0.0.0.0). The harness uses a hardcoded ADMIN_TOKEN so anyone on the local network or VPN could hit /workspaces with admin privileges. Switch to 127.0.0.1:8080 so admin access is loopback-only — safe for E2E and prevents the known-token leak. 2. Required — dead code in cp-stub peersFailureMode + __stub/mode + __stub/peers were declared with atomic.Value setters but no handler ever READ from them. CP doesn't host /registry/peers (the tenant does), so the toggles couldn't drive responses. Removed the dead vars + handlers; kept redeployFleetCalls counter and __stub/state since those have a real consumer in the buildinfo replay. 3. Required — replay's auth-context dependency peer-discovery-404.sh's Python eval ran a2a_client.get_peers_with_ diagnostic() against the live tenant. Without a workspace token file, auth_headers() yields empty headers — so the helper might exercise a 401 branch instead of the 404 branch the replay claims to test. Split the assertion into (a) WIRE — direct curl proves the platform returns 404 from /registry/<unregistered>/peers — and (b) PARSE — feed the helper a mocked 404 via httpx patches, no network/auth. Each branch tests exactly what it claims. Also added a graceful skip when the workspace runtime in the current checkout pre-dates #2399 (no get_peers_with_diagnostic yet) — replay falls back to wire-only verification with a clear message instead of an opaque AttributeError. After #2399 lands on staging, both branches will run. cp-stub still builds clean. compose.yml validates. Replay's bash syntax + Python eval both verified locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f13d2b2b7b
commit
046eccbb7c
@ -119,8 +119,12 @@ services:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
|
||||
# Bind to 127.0.0.1 only — the harness uses a hardcoded ADMIN_TOKEN
|
||||
# ("harness-admin-token") so binding 0.0.0.0 (compose's default)
|
||||
# would expose admin access to anyone on the local network or VPN.
|
||||
# Loopback-only is safe for E2E and prevents a known-token leak.
|
||||
ports:
|
||||
- "8080:8080"
|
||||
- "127.0.0.1:8080:8080"
|
||||
networks: [harness-net]
|
||||
|
||||
networks:
|
||||
|
||||
@ -24,28 +24,13 @@ import (
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// peersFailureMode controls /registry/<id>/peers responses for replay scripts.
|
||||
// Empty (default) → 200 with the rolling peer list set via /__stub/peers.
|
||||
// "404" → 404 (workspace not registered) — replay #2397.
|
||||
// "401" → 401 (auth failure) — replay #2397.
|
||||
// "500" → 500 (platform error) — replay #2397.
|
||||
// "timeout" → hang for 60s — replay #2397 network branch.
|
||||
//
|
||||
// Set via env var CP_STUB_PEERS_MODE at startup, or POST /__stub/mode at runtime.
|
||||
var (
|
||||
peersFailureMode atomic.Value // string
|
||||
peersList atomic.Value // []map[string]any
|
||||
redeployFleetCalls atomic.Int64
|
||||
)
|
||||
|
||||
func init() {
|
||||
peersFailureMode.Store(strings.ToLower(os.Getenv("CP_STUB_PEERS_MODE")))
|
||||
peersList.Store([]map[string]any{})
|
||||
}
|
||||
// redeployFleetCalls tracks how many times /cp/admin/tenants/redeploy-fleet
|
||||
// was invoked. Replay scripts assert > 0 to confirm the workflow's redeploy
|
||||
// step actually reached the stub (catches misrouted CP_URL configs).
|
||||
var redeployFleetCalls atomic.Int64
|
||||
|
||||
func main() {
|
||||
mux := http.NewServeMux()
|
||||
@ -81,39 +66,10 @@ func main() {
|
||||
})
|
||||
})
|
||||
|
||||
// __stub/peers — set the rolling peer list returned via tenant's
|
||||
// /registry/<id>/peers proxy. Used by replay scripts to seed the
|
||||
// scenario before invoking tool_list_peers from a workspace.
|
||||
mux.HandleFunc("/__stub/peers", func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "POST required", 405)
|
||||
return
|
||||
}
|
||||
var body []map[string]any
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
http.Error(w, "bad JSON: "+err.Error(), 400)
|
||||
return
|
||||
}
|
||||
peersList.Store(body)
|
||||
writeJSON(w, 200, map[string]any{"ok": true, "count": len(body)})
|
||||
})
|
||||
|
||||
// __stub/mode — toggle peersFailureMode at runtime for replay scripts.
|
||||
mux.HandleFunc("/__stub/mode", func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "POST required", 405)
|
||||
return
|
||||
}
|
||||
mode := strings.ToLower(r.URL.Query().Get("peers"))
|
||||
peersFailureMode.Store(mode)
|
||||
writeJSON(w, 200, map[string]any{"ok": true, "peers_mode": mode})
|
||||
})
|
||||
|
||||
// __stub/state — expose stub state (counters, current mode) so replay
|
||||
// scripts can assert the tenant actually called us.
|
||||
// __stub/state — expose stub state (counters) so replay scripts can
|
||||
// assert the tenant actually reached us. Read-only.
|
||||
mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, 200, map[string]any{
|
||||
"peers_mode": peersFailureMode.Load(),
|
||||
"redeploy_fleet_calls": redeployFleetCalls.Load(),
|
||||
})
|
||||
})
|
||||
|
||||
@ -1,20 +1,29 @@
|
||||
#!/usr/bin/env bash
|
||||
# Replay for issue #2397 — local proof that the peer-discovery
|
||||
# diagnostic surfacing fix actually works.
|
||||
# Replay for issue #2397 — local proof that peer-discovery surfaces
|
||||
# actionable diagnostics instead of "may be isolated".
|
||||
#
|
||||
# Prior behavior: tool_list_peers returned "No peers available (this
|
||||
# workspace may be isolated)" regardless of WHY peers were empty.
|
||||
# Five distinct conditions collapsed to one ambiguous message.
|
||||
# workspace may be isolated)" regardless of WHY peers were empty —
|
||||
# five distinct conditions (200+empty, 401, 403, 404, 5xx, network)
|
||||
# collapsed to one ambiguous message.
|
||||
#
|
||||
# This replay seeds the cp-stub to return 404 from /registry/<id>/peers
|
||||
# (simulating a workspace whose registration was wiped), then calls
|
||||
# the workspace's tool_list_peers via MCP. After the fix in #2399, the
|
||||
# response should mention "404" + "registered" — proving the diagnostic
|
||||
# reaches the agent in production-shape topology, not just unit tests.
|
||||
# This replay proves two things, separately:
|
||||
# (a) WIRE: the platform side of the contract — the tenant's
|
||||
# /registry/<unregistered>/peers returns 404. If this regresses
|
||||
# (e.g. tenant starts returning 200 with empty list, or 500),
|
||||
# the runtime helper would parse it differently and the agent
|
||||
# would see a different diagnostic. The harness catches that here.
|
||||
# (b) PARSE: the runtime helper, given a 404, produces a diagnostic
|
||||
# containing "404" + "register" hints. Done in unit tests against
|
||||
# a mock httpx response (test_a2a_client.py::TestGetPeersWithDiagnostic
|
||||
# — the harness re-asserts the same contract here against a real
|
||||
# Python eval that does NOT depend on workspace auth tokens.
|
||||
#
|
||||
# Pre-fix baseline: this script's PASS criterion is the new diagnostic
|
||||
# string. If we ever regress to "may be isolated", the replay fails
|
||||
# and CI catches it before the agent + user are blind to the cause.
|
||||
# Why split the assertion: the Python eval here doesn't have the
|
||||
# workspace's auth token file, so going through get_peers_with_diagnostic
|
||||
# directly would hit the platform without auth and produce a different
|
||||
# branch (401 instead of 404). Splitting (a) from (b) keeps each
|
||||
# assertion targeting exactly what it claims to test.
|
||||
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
@ -32,76 +41,99 @@ BASE="${BASE:-http://harness-tenant.localhost:8080}"
|
||||
ADMIN="harness-admin-token"
|
||||
ORG="harness-org"
|
||||
|
||||
# 1. Toggle cp-stub to return 404 on the peers endpoint. This isn't
|
||||
# actually how the platform calls it (the platform's /registry
|
||||
# endpoints aren't proxied through cp-stub), but the workspace
|
||||
# runtime's get_peers calls /registry/:id/peers ON THE TENANT —
|
||||
# which DB-resolves and returns []. To force a 404 path on the
|
||||
# runtime side, we'd need a workspace whose ID never registered.
|
||||
# Easier replay: ask the runtime to look up a non-existent id.
|
||||
#
|
||||
# Step 1: ask the tenant for peers of a non-registered id. Tenant's
|
||||
# discovery handler returns 404 when the workspace doesn't exist.
|
||||
|
||||
# ─── (a) WIRE: tenant returns 404 for an unregistered workspace ────────
|
||||
ROGUE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')"
|
||||
|
||||
echo "[replay] querying /registry/$ROGUE_ID/peers (workspace doesn't exist)..."
|
||||
echo "[replay] (a) WIRE: querying /registry/$ROGUE_ID/peers (unregistered workspace)..."
|
||||
HTTP_CODE=$(curl -sS -o /tmp/peer-replay.json -w '%{http_code}' \
|
||||
-H "Authorization: Bearer $ADMIN" \
|
||||
-H "X-Molecule-Org-Id: $ORG" \
|
||||
-H "X-Workspace-ID: $ROGUE_ID" \
|
||||
"$BASE/registry/$ROGUE_ID/peers")
|
||||
|
||||
echo "[replay] tenant responded HTTP $HTTP_CODE"
|
||||
|
||||
# 2. The Python diagnostic helper get_peers_with_diagnostic must convert
|
||||
# that 404 into an actionable string. We simulate the helper's parse
|
||||
# here to assert the contract end-to-end (the runtime is the actual
|
||||
# consumer; this proves the wire shape that feeds it).
|
||||
|
||||
echo "[replay] tenant responded HTTP $HTTP_CODE"
|
||||
if [ "$HTTP_CODE" != "404" ]; then
|
||||
echo "[replay] FAIL: expected 404 from /registry/<unregistered>/peers, got $HTTP_CODE"
|
||||
echo "[replay] FAIL (a): expected 404 from /registry/<unregistered>/peers, got $HTTP_CODE"
|
||||
echo "[replay] This is a platform-side regression — the runtime's diagnostic helper"
|
||||
echo "[replay] would see a different status code than the unit tests cover."
|
||||
cat /tmp/peer-replay.json
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 3. Verify that running the runtime's diagnostic helper against this
|
||||
# response surfaces the actionable string. We call the helper as a
|
||||
# one-shot Python eval, mirroring how the runtime would consume it.
|
||||
|
||||
echo "[replay] invoking workspace runtime diagnostic helper against the 404..."
|
||||
# ─── (b) PARSE: helper converts a synthetic 404 to actionable diagnostic ─
|
||||
#
|
||||
# We construct a synthetic httpx 404 response and run the helper against
|
||||
# it directly. This isolates the parse branch we want to test from the
|
||||
# auth-context concerns of going through the network. The helper's network
|
||||
# branches are exhaustively covered by tests/test_a2a_client.py — this is
|
||||
# a regression-guard that the helper IS in the install, IS importable in
|
||||
# the harness's Python env, and IS reading the status code.
|
||||
|
||||
WORKSPACE_PATH="$(cd "$HARNESS_ROOT/../../workspace" && pwd)"
|
||||
DIAGNOSTIC=$(WORKSPACE_ID="$ROGUE_ID" PLATFORM_URL="$BASE" \
|
||||
PYTHONPATH="$WORKSPACE_PATH" \
|
||||
python3 -c "
|
||||
import asyncio, sys
|
||||
sys.path.insert(0, '$WORKSPACE_PATH')
|
||||
import a2a_client
|
||||
DIAGNOSTIC=$(WORKSPACE_ID="harness-rogue" PYTHONPATH="$WORKSPACE_PATH" \
|
||||
python3 - "$WORKSPACE_PATH" <<'PYEOF'
|
||||
import asyncio
|
||||
import sys
|
||||
import types
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
# Stub platform_auth so a2a_client imports cleanly without requiring a
|
||||
# real workspace token file. The helper's auth_headers() only matters
|
||||
# when going through the network; we're feeding it a mock response.
|
||||
_pa = types.ModuleType("platform_auth")
|
||||
_pa.auth_headers = lambda: {}
|
||||
_pa.self_source_headers = lambda: {}
|
||||
sys.modules.setdefault("platform_auth", _pa)
|
||||
|
||||
sys.path.insert(0, sys.argv[1])
|
||||
import a2a_client # noqa: E402
|
||||
|
||||
# This replay validates PR #2399's diagnostic helper. If the workspace
|
||||
# runtime in the current checkout pre-dates that fix, fail with a
|
||||
# clear message instead of an opaque AttributeError.
|
||||
if not hasattr(a2a_client, "get_peers_with_diagnostic"):
|
||||
print("__SKIP__: workspace/a2a_client.py is pre-#2399 (no get_peers_with_diagnostic).")
|
||||
sys.exit(0)
|
||||
|
||||
resp = MagicMock()
|
||||
resp.status_code = 404
|
||||
resp.json = MagicMock(return_value={"detail": "not found"})
|
||||
|
||||
mock_client = AsyncMock()
|
||||
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||
mock_client.__aexit__ = AsyncMock(return_value=False)
|
||||
mock_client.get = AsyncMock(return_value=resp)
|
||||
|
||||
async def main():
|
||||
peers, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
peers, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
print(repr(diag))
|
||||
|
||||
asyncio.run(main())
|
||||
")
|
||||
PYEOF
|
||||
)
|
||||
|
||||
echo "[replay] diagnostic from helper: $DIAGNOSTIC"
|
||||
if [[ "$DIAGNOSTIC" == __SKIP__:* ]]; then
|
||||
echo "[replay] (b) SKIP: ${DIAGNOSTIC#__SKIP__: }"
|
||||
echo "[replay] Re-run after #2399 lands on staging."
|
||||
echo ""
|
||||
echo "[replay] PASS (a) only: peer-discovery wire returns 404 (parse branch skipped — see above)."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 4. Assert the diagnostic contains "404" + "register" — the actionable
|
||||
# parts of the message. If we regress to None or "may be isolated",
|
||||
# fail the replay.
|
||||
echo "[replay] (b) PARSE: helper diagnostic = $DIAGNOSTIC"
|
||||
|
||||
if ! echo "$DIAGNOSTIC" | grep -q "404"; then
|
||||
echo "[replay] FAIL: diagnostic missing '404' — regressed to swallow-the-status-code"
|
||||
echo "[replay] FAIL (b): diagnostic missing '404' — helper regressed to swallow-the-status-code"
|
||||
exit 1
|
||||
fi
|
||||
if ! echo "$DIAGNOSTIC" | grep -qi "regist"; then
|
||||
echo "[replay] FAIL: diagnostic missing 'register' guidance — regressed to opaque message"
|
||||
echo "[replay] FAIL (b): diagnostic missing 'register' guidance — helper regressed to opaque message"
|
||||
exit 1
|
||||
fi
|
||||
if echo "$DIAGNOSTIC" | grep -qi "may be isolated"; then
|
||||
echo "[replay] FAIL: diagnostic still says 'may be isolated' — fix didn't reach this code path"
|
||||
echo "[replay] FAIL (b): diagnostic still says 'may be isolated' — fix didn't reach this code path"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "[replay] PASS: peer-discovery 404 surfaces actionable diagnostic in production-shape topology."
|
||||
echo "[replay] PASS: peer-discovery (a) wire returns 404, (b) helper produces actionable diagnostic."
|
||||
|
||||
Loading…
Reference in New Issue
Block a user