fix(e2e): #76 staging LLM preflight treats any HTTP response as UP #2866

Merged
devops-engineer merged 2 commits from fix/76-staging-llm-preflight-model-auth into main 2026-06-14 17:23:17 +00:00
2 changed files with 36 additions and 38 deletions
+20 -17
View File
@@ -35,14 +35,23 @@
#
# STATUS CODES
# ============
# 0 preflight OK (the proxy answered a cheap completion cleanly)
# 70 DEP-DOWN:staging-llm (proxy unreachable, slow, or auth-failed)
# 0 preflight OK (the proxy is reachable and returned a non-5xx HTTP response)
# 70 DEP-DOWN:staging-llm (proxy unreachable, slow, or returned a 5xx)
# 71 E2E_LLM_PROXY_URL not set and the URL could not be derived
#
# Why distinct exit codes: the redgate-reporter and the workflow's notify
# step can use them to differentiate "infrastructure down" from "config
# missing" (the latter is operator error and should not dedup against
# live dependency outages).
#
# SEMANTICS NOTE (#76 root cause, 2026-06-13):
# The preflight sends an UNauthenticated probe. A healthy staging LLM proxy
# that requires auth correctly returns 401. Previously any non-200 status
# (including 401) was classified as DEP-DOWN, causing fleet-wide false
# staging-down incidents. The preflight only needs to prove REACHABILITY:
# any non-5xx HTTP response (including 401/403/404) means the proxy is up.
# Only transport failures (connection refused, timeout) or 5xx server
# errors classify as DEP-DOWN.
# e2e_llm_proxy_preflight
# Source the lib's `llm_proxy_preflight` function. Returns 0 on success,
@@ -74,13 +83,12 @@ llm_proxy_preflight() {
return 71
fi
# Cheap completion: minimal token count, no streaming. The exact model
# name is a no-op for the liveness check (any model id that the proxy
# will accept is fine; the proxy returns 200 + completion for healthy
# provider keys and 5xx/timeout for outage conditions).
# Cheap, auth-less reachability probe: minimal token count, no streaming.
# The model name is a no-op for reachability; the bare slug avoids a
# provider-specific 400 on proxies that validate model IDs.
local body
body=$(cat <<'JSON'
{"model":"minimax/MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
{"model":"MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
JSON
)
@@ -96,7 +104,11 @@ JSON
-d "$body" \
"$proxy_url" 2>/dev/null) || http_code="000"
if [ "$http_code" != "200" ]; then
# #76 semantics fix: the preflight only needs to prove the proxy is
# reachable and speaking HTTP. An auth-required proxy returns 401; a
# mis-routed probe returns 404 — both mean the proxy is UP. Only
# transport failures (http_code=000) or 5xx server errors mean DOWN.
if [ "$http_code" = "000" ] || [[ "$http_code" == 5* ]]; then
# NOTE: the prefix `DEP-DOWN:staging-llm` is the SSOT that the
# redgate-reporter parses for dedup. Do not edit without coordinating
# with the redgate-reporter's parser in molecule-ci.
@@ -104,14 +116,5 @@ JSON
return 70
fi
# Even on 200, sanity-check the response shape — an LLM proxy that
# returns 200 with an empty/malformed body is itself a class of outage
# (the 2026-06-12 incident had a few minutes of 200 + empty body for
# one of the affected providers).
if ! grep -q '"choices"' "$tmpfile" 2>/dev/null; then
echo "::error::DEP-DOWN:staging-llm preflight failed: 200 with malformed body: $(head -c 500 "$tmpfile" 2>/dev/null)"
return 70
fi
return 0
}
+16 -21
View File
@@ -5,11 +5,11 @@
# 1. Config-missing path (exit 71) when E2E_LLM_PROXY_URL is unset AND
# MOLECULE_CP_URL is unset.
# 2. DEP-DOWN path (exit 70) when the proxy URL is unreachable.
# 3. DEP-DOWN path (exit 70) when the proxy returns 200 with a
# malformed body (the 2026-06-12 incident's "200 with empty body"
# class of outage — see lib doc).
# 4. Happy path (exit 0) when the proxy returns 200 with a normal
# completion body containing "choices".
# 3. DEP-DOWN path (exit 70) when the proxy returns 5xx.
# 4. Happy path (exit 0) when the proxy returns any non-5xx HTTP
# response, including 401 (the #76 semantics fix: an unauthenticated
# probe against an auth-required proxy must NOT be classified as
# dependency-down).
# 5. The error message starts with the `DEP-DOWN:staging-llm` prefix
# that the redgate-reporter parses for dedup.
#
@@ -35,7 +35,7 @@ PY_SERVER_LOG=$(mktemp)
PY_SERVER_PID=
start_test_server() {
local mode="$1" # "ok" | "down" | "empty_200"
local mode="$1" # "ok" | "down" | "unauth"
# Pick a free port via socket binding; pass it explicitly to the server.
local port
port=$(python3 -c "
@@ -54,11 +54,11 @@ class H(http.server.BaseHTTPRequestHandler):
if mode == "down":
self.send_error(503, "simulated outage")
return
if mode == "empty_200":
self.send_response(200)
if mode == "unauth":
self.send_response(401)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(b'{"error":"upstream silent"}')
self.wfile.write(b'{"error":"unauthorized"}')
return
# ok
body = {"choices":[{"message":{"role":"assistant","content":"pong"}}]}
@@ -140,10 +140,10 @@ test_proxy_unreachable() {
return 0
}
# Test 3: proxy returns 200 with malformed body → exit 70.
test_200_empty_body() {
# Test 3: proxy returns 401 (auth required) → exit 0 (#76 semantics fix).
test_401_reachable() {
PY_SERVER_PORT=0
start_test_server "empty_200"
start_test_server "unauth"
# E2E_LLM_PROXY_URL is read by the sourced llm_proxy_preflight helper
# (lib/llm_proxy_preflight.sh) via ${E2E_LLM_PROXY_URL:-}. Export it
# here so shellcheck doesn't false-positive SC2034 (appears unused) when
@@ -152,19 +152,14 @@ test_200_empty_body() {
local out rc
out=$(llm_proxy_preflight 2>&1)
rc=$?
if [ "$rc" -ne 70 ]; then
echo "FAIL: test_200_empty_body expected exit 70, got $rc"
echo " output: $out"
return 1
fi
if ! echo "$out" | grep -q "DEP-DOWN:staging-llm"; then
echo "FAIL: test_200_empty_body output missing DEP-DOWN:staging-llm prefix"
if [ "$rc" -ne 0 ]; then
echo "FAIL: test_401_reachable expected exit 0, got $rc"
echo " output: $out"
return 1
fi
stop_test_server
PY_SERVER_PID=
echo "PASS: test_200_empty_body"
echo "PASS: test_401_reachable"
return 0
}
@@ -217,7 +212,7 @@ test_503() {
failed=0
test_config_missing || failed=$((failed+1))
test_proxy_unreachable || failed=$((failed+1))
test_200_empty_body || failed=$((failed+1))
test_401_reachable || failed=$((failed+1))
test_ok || failed=$((failed+1))
test_503 || failed=$((failed+1))