diff --git a/tests/e2e/lib/llm_proxy_preflight.sh b/tests/e2e/lib/llm_proxy_preflight.sh index 144709f5..e767d3c5 100755 --- a/tests/e2e/lib/llm_proxy_preflight.sh +++ b/tests/e2e/lib/llm_proxy_preflight.sh @@ -35,14 +35,23 @@ # # STATUS CODES # ============ -# 0 preflight OK (the proxy answered a cheap completion cleanly) -# 70 DEP-DOWN:staging-llm (proxy unreachable, slow, or auth-failed) +# 0 preflight OK (the proxy is reachable and returned a non-5xx HTTP response) +# 70 DEP-DOWN:staging-llm (proxy unreachable, slow, or returned a 5xx) # 71 E2E_LLM_PROXY_URL not set and the URL could not be derived # # Why distinct exit codes: the redgate-reporter and the workflow's notify # step can use them to differentiate "infrastructure down" from "config # missing" (the latter is operator error and should not dedup against # live dependency outages). +# +# SEMANTICS NOTE (#76 root cause, 2026-06-13): +# The preflight sends an UNauthenticated probe. A healthy staging LLM proxy +# that requires auth correctly returns 401. Previously any non-200 status +# (including 401) was classified as DEP-DOWN, causing fleet-wide false +# staging-down incidents. The preflight only needs to prove REACHABILITY: +# any non-5xx HTTP response (including 401/403/404) means the proxy is up. Only +# transport failures (connection refused, timeout) or 5xx server errors +# classify as DEP-DOWN. # e2e_llm_proxy_preflight # Source the lib's `llm_proxy_preflight` function. Returns 0 on success, @@ -74,13 +83,12 @@ llm_proxy_preflight() { return 71 fi - # Cheap completion: minimal token count, no streaming. The exact model - # name is a no-op for the liveness check (any model id that the proxy - # will accept is fine; the proxy returns 200 + completion for healthy - # provider keys and 5xx/timeout for outage conditions). + # Cheap, auth-less reachability probe: minimal token count, no streaming. 
+ # The model name is a no-op for reachability; the bare slug avoids a + # provider-specific 400 on proxies that validate model IDs. local body body=$(cat <<'JSON' -{"model":"minimax/MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]} +{"model":"MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]} JSON ) @@ -96,7 +104,11 @@ JSON -d "$body" \ "$proxy_url" 2>/dev/null) || http_code="000" - if [ "$http_code" != "200" ]; then + # #76 semantics fix: the preflight only needs to prove the proxy is + # reachable and speaking HTTP. An auth-required proxy returns 401; a + # mis-routed probe returns 404 — both mean the proxy is UP. Only + # transport failures (http_code=000) or 5xx server errors mean DOWN. + if [ "$http_code" = "000" ] || [[ "$http_code" == 5* ]]; then # NOTE: the prefix `DEP-DOWN:staging-llm` is the SSOT that the # redgate-reporter parses for dedup. Do not edit without coordinating # with the redgate-reporter's parser in molecule-ci. @@ -104,14 +116,5 @@ JSON return 70 fi - # Even on 200, sanity-check the response shape — an LLM proxy that - # returns 200 with an empty/malformed body is itself a class of outage - # (the 2026-06-12 incident had a few minutes of 200 + empty body for - # one of the affected providers). - if ! grep -q '"choices"' "$tmpfile" 2>/dev/null; then - echo "::error::DEP-DOWN:staging-llm preflight failed: 200 with malformed body: $(head -c 500 "$tmpfile" 2>/dev/null)" - return 70 - fi - return 0 } diff --git a/tests/e2e/test_llm_proxy_preflight_unit.sh b/tests/e2e/test_llm_proxy_preflight_unit.sh index 46c06f12..7aa90ce4 100755 --- a/tests/e2e/test_llm_proxy_preflight_unit.sh +++ b/tests/e2e/test_llm_proxy_preflight_unit.sh @@ -5,11 +5,11 @@ # 1. Config-missing path (exit 71) when E2E_LLM_PROXY_URL is unset AND # MOLECULE_CP_URL is unset. # 2. DEP-DOWN path (exit 70) when the proxy URL is unreachable. -# 3. 
DEP-DOWN path (exit 70) when the proxy returns 200 with a -# malformed body (the 2026-06-12 incident's "200 with empty body" -# class of outage — see lib doc). -# 4. Happy path (exit 0) when the proxy returns 200 with a normal -# completion body containing "choices". +# 3. DEP-DOWN path (exit 70) when the proxy returns 5xx. +# 4. Happy path (exit 0) when the proxy returns any non-5xx HTTP response, +# including 401 (the #76 semantics fix: an unauthenticated probe +# against an auth-required proxy must NOT be classified as +# dependency-down). # 5. The error message starts with the `DEP-DOWN:staging-llm` prefix # that the redgate-reporter parses for dedup. # @@ -35,7 +35,7 @@ PY_SERVER_LOG=$(mktemp) PY_SERVER_PID= start_test_server() { - local mode="$1" # "ok" | "down" | "empty_200" + local mode="$1" # "ok" | "down" | "unauth" # Pick a free port via socket binding; pass it explicitly to the server. local port port=$(python3 -c " @@ -54,11 +54,11 @@ class H(http.server.BaseHTTPRequestHandler): if mode == "down": self.send_error(503, "simulated outage") return - if mode == "empty_200": - self.send_response(200) + if mode == "unauth": + self.send_response(401) self.send_header("Content-Type", "application/json") self.end_headers() - self.wfile.write(b'{"error":"upstream silent"}') + self.wfile.write(b'{"error":"unauthorized"}') return # ok body = {"choices":[{"message":{"role":"assistant","content":"pong"}}]} @@ -140,10 +140,10 @@ test_proxy_unreachable() { return 0 } -# Test 3: proxy returns 200 with malformed body → exit 70. -test_200_empty_body() { +# Test 3: proxy returns 401 (auth required) → exit 0 (#76 semantics fix). +test_401_reachable() { PY_SERVER_PORT=0 - start_test_server "empty_200" + start_test_server "unauth" # E2E_LLM_PROXY_URL is read by the sourced llm_proxy_preflight helper # (lib/llm_proxy_preflight.sh) via ${E2E_LLM_PROXY_URL:-}. 
Export it # here so shellcheck doesn't false-positive SC2034 (appears unused) when @@ -152,19 +152,14 @@ test_200_empty_body() { local out rc out=$(llm_proxy_preflight 2>&1) rc=$? - if [ "$rc" -ne 70 ]; then - echo "FAIL: test_200_empty_body expected exit 70, got $rc" - echo " output: $out" - return 1 - fi - if ! echo "$out" | grep -q "DEP-DOWN:staging-llm"; then - echo "FAIL: test_200_empty_body output missing DEP-DOWN:staging-llm prefix" + if [ "$rc" -ne 0 ]; then + echo "FAIL: test_401_reachable expected exit 0, got $rc" echo " output: $out" return 1 fi stop_test_server PY_SERVER_PID= - echo "PASS: test_200_empty_body" + echo "PASS: test_401_reachable" return 0 } @@ -217,7 +212,7 @@ test_503() { failed=0 test_config_missing || failed=$((failed+1)) test_proxy_unreachable || failed=$((failed+1)) -test_200_empty_body || failed=$((failed+1)) +test_401_reachable || failed=$((failed+1)) test_ok || failed=$((failed+1)) test_503 || failed=$((failed+1))