From 201f39a6d07ae01b0fba4c03f4b88da22db831f4 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 10:27:50 -0700 Subject: [PATCH] test(e2e): set delivery_mode=poll explicitly to decouple from image drift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second-round failure on the same test (run 25179171433): register response: {"error":"hostname \"example.invalid\" cannot be resolved (DNS error)"} HTTP_CODE=400 Root cause: registry.Register's resolveDeliveryMode was supposed to default runtime=external workspaces to poll mode (PR #2382), in which case validateAgentURL is skipped and example.invalid passes through. But the freshly-provisioned staging tenant for this test was running an older workspace-server image that lacked that branch — the implicit default was still push, validateAgentURL ran, and the DNS lookup 400'd. Same image-drift class as the production bug seen on the hongmingwang tenant 17:30Z (deployed image lagging main HEAD). Fix: send delivery_mode="poll" explicitly. Eliminates the test's dependence on resolveDeliveryMode's default branch being deployed. Step 5b reframed: was "verify external→poll default working", now "verify explicit-poll round-trips". The default-resolution behavior is exercised by handler-level tests in registry_test.go, which run against the SHA being merged (not whatever :latest happens to be on the fleet). That's the right place for it — E2E should test what users see, unit tests should pin what handlers compute. Pulling those apart removes a class of "intermittent on staging, green locally" failures. The deeper bug — fleet redeploy + provision both can serve stale images even when the tag has been republished — gets a separate issue. This commit just unblocks the merge. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/e2e/test_staging_external_runtime.sh | 38 +++++++++++++++------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/tests/e2e/test_staging_external_runtime.sh b/tests/e2e/test_staging_external_runtime.sh index 4876b4c5..9c1ffffc 100755 --- a/tests/e2e/test_staging_external_runtime.sh +++ b/tests/e2e/test_staging_external_runtime.sh @@ -258,15 +258,25 @@ ok "DB row stored as awaiting_agent (proof migration 046 applied)" log "5/8 Registering workspace via /registry/register..." [ -z "$WS_AUTH_TOKEN" ] && fail "No workspace auth token returned — register impossible" # Payload contract (workspace-server/internal/models/workspace.go RegisterPayload): -# id — required, the workspace UUID (NOT "workspace_id" — that's the -# heartbeat payload field; mixing them yields a 400 from -# ShouldBindJSON because `id` has binding:"required"). -# agent_card — required (binding:"required"); minimal valid card is name+skills. -# url — only validated for push-mode workspaces; runtime=external -# resolves to poll (registry.go:resolveDeliveryMode), so -# example.invalid is accepted as a placeholder URL the -# platform never dispatches to. -REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID") +# id — required, the workspace UUID (NOT "workspace_id" — that's the +# heartbeat payload field; mixing them yields a 400 from +# ShouldBindJSON because `id` has binding:"required"). +# agent_card — required (binding:"required"); minimal valid card is name+skills. +# delivery_mode — set explicitly to "poll" so url validation is skipped +# regardless of whether the deployed image has the +# runtime=external→poll default from PR #2382. 
Observed +# 2026-04-30 17:18Z: a freshly-provisioned staging tenant +# was running an older workspace-server :latest image +# that lacked resolveDeliveryMode's external→poll branch, +# so the implicit default was push and validateAgentURL +# 400'd on example.invalid. Asserting on the implicit +# default makes the *register call* itself fragile to +# image-tag drift on the fleet — the default is verified +# separately (handler-level tests in registry_test.go), +# not here. +# url — accepted but not dispatched-to in poll mode, so +# example.invalid is a valid sentinel. +REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID") # Disable --fail-with-body for this one call so a 4xx surfaces the response # body (the bare CURL_COMMON would `set -e`-kill before we could log it). REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \ @@ -282,12 +292,16 @@ ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load( [ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS" ok "Workspace transitioned to online" -# Confirm delivery_mode defaulted to poll for runtime=external (PR #2382). +# Confirm explicit delivery_mode=poll round-trips correctly. We now pass +# poll explicitly above (see REGISTER_BODY) rather than rely on the +# runtime=external→poll default, so this is a round-trip smoke check, not +# a default-resolution check. The default is exercised by handler-level +# tests in workspace-server/internal/handlers/registry_test.go. 
DELIVERY_MODE=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('delivery_mode',''))") if [ "$DELIVERY_MODE" = "poll" ]; then - ok "delivery_mode=poll (resolveDeliveryMode external default working)" + ok "delivery_mode=poll (explicit-poll round-trip)" else - log " delivery_mode=$DELIVERY_MODE (poll default may be off — non-fatal for this test)" + fail "Expected delivery_mode=poll (explicitly set in REGISTER_BODY), got $DELIVERY_MODE — register UPDATE not honoring payload.delivery_mode" fi # ─── 6. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER ────────