fix(e2e): #76 staging LLM preflight treats any HTTP response as UP #2866
@@ -35,14 +35,23 @@
|
||||
#
|
||||
# STATUS CODES
|
||||
# ============
|
||||
# 0 preflight OK (the proxy answered a cheap completion cleanly)
|
||||
# 70 DEP-DOWN:staging-llm (proxy unreachable, slow, or auth-failed)
|
||||
# 0 preflight OK (the proxy is reachable and returned a non-5xx HTTP response)
|
||||
# 70 DEP-DOWN:staging-llm (proxy unreachable, slow, or returned a 5xx)
|
||||
# 71 E2E_LLM_PROXY_URL not set and the URL could not be derived
|
||||
#
|
||||
# Why distinct exit codes: the redgate-reporter and the workflow's notify
|
||||
# step can use them to differentiate "infrastructure down" from "config
|
||||
# missing" (the latter is operator error and should not dedup against
|
||||
# live dependency outages).
|
||||
#
|
||||
# SEMANTICS NOTE (#76 root cause, 2026-06-13):
|
||||
# The preflight sends an UNauthenticated probe. A healthy staging LLM proxy
|
||||
# that requires auth correctly returns 401. Previously any non-200 status
|
||||
# (including 401) was classified as DEP-DOWN, causing fleet-wide false
|
||||
# staging-down incidents. The preflight only needs to prove REACHABILITY:
|
||||
# any non-5xx HTTP response (including 401/403/404) means the proxy is up. Only
|
||||
# transport failures (connection refused, timeout) or 5xx server errors
|
||||
# classify as DEP-DOWN.
|
||||
|
||||
# e2e_llm_proxy_preflight
|
||||
# Source the lib's `llm_proxy_preflight` function. Returns 0 on success,
|
||||
@@ -74,13 +83,12 @@ llm_proxy_preflight() {
|
||||
return 71
|
||||
fi
|
||||
|
||||
# Cheap completion: minimal token count, no streaming. The exact model
|
||||
# name is a no-op for the liveness check (any model id that the proxy
|
||||
# will accept is fine; the proxy returns 200 + completion for healthy
|
||||
# provider keys and 5xx/timeout for outage conditions).
|
||||
# Cheap, auth-less reachability probe: minimal token count, no streaming.
|
||||
# The model name is a no-op for reachability; the bare slug avoids a
|
||||
# provider-specific 400 on proxies that validate model IDs.
|
||||
local body
|
||||
body=$(cat <<'JSON'
|
||||
{"model":"minimax/MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
|
||||
{"model":"MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
|
||||
JSON
|
||||
)
|
||||
|
||||
@@ -96,7 +104,11 @@ JSON
|
||||
-d "$body" \
|
||||
"$proxy_url" 2>/dev/null) || http_code="000"
|
||||
|
||||
if [ "$http_code" != "200" ]; then
|
||||
# #76 semantics fix: the preflight only needs to prove the proxy is
|
||||
# reachable and speaking HTTP. An auth-required proxy returns 401; a
|
||||
# mis-routed probe returns 404 — both mean the proxy is UP. Only
|
||||
# transport failures (http_code=000) or 5xx server errors mean DOWN.
|
||||
if [ "$http_code" = "000" ] || [[ "$http_code" == 5* ]]; then
|
||||
# NOTE: the prefix `DEP-DOWN:staging-llm` is the SSOT that the
|
||||
# redgate-reporter parses for dedup. Do not edit without coordinating
|
||||
# with the redgate-reporter's parser in molecule-ci.
|
||||
@@ -104,14 +116,5 @@ JSON
|
||||
return 70
|
||||
fi
|
||||
|
||||
# Even on 200, sanity-check the response shape — an LLM proxy that
|
||||
# returns 200 with an empty/malformed body is itself a class of outage
|
||||
# (the 2026-06-12 incident had a few minutes of 200 + empty body for
|
||||
# one of the affected providers).
|
||||
if ! grep -q '"choices"' "$tmpfile" 2>/dev/null; then
|
||||
echo "::error::DEP-DOWN:staging-llm preflight failed: 200 with malformed body: $(head -c 500 "$tmpfile" 2>/dev/null)"
|
||||
return 70
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -5,11 +5,11 @@
|
||||
# 1. Config-missing path (exit 71) when E2E_LLM_PROXY_URL is unset AND
|
||||
# MOLECULE_CP_URL is unset.
|
||||
# 2. DEP-DOWN path (exit 70) when the proxy URL is unreachable.
|
||||
# 3. DEP-DOWN path (exit 70) when the proxy returns 200 with a
|
||||
# malformed body (the 2026-06-12 incident's "200 with empty body"
|
||||
# class of outage — see lib doc).
|
||||
# 4. Happy path (exit 0) when the proxy returns 200 with a normal
|
||||
# completion body containing "choices".
|
||||
# 3. DEP-DOWN path (exit 70) when the proxy returns 5xx.
|
||||
# 4. Happy path (exit 0) when the proxy returns any non-5xx HTTP response,
|
||||
# including 401 (the #76 semantics fix: an unauthenticated probe
|
||||
# against an auth-required proxy must NOT be classified as
|
||||
# dependency-down).
|
||||
# 5. The error message starts with the `DEP-DOWN:staging-llm` prefix
|
||||
# that the redgate-reporter parses for dedup.
|
||||
#
|
||||
@@ -35,7 +35,7 @@ PY_SERVER_LOG=$(mktemp)
|
||||
PY_SERVER_PID=
|
||||
|
||||
start_test_server() {
|
||||
local mode="$1" # "ok" | "down" | "empty_200"
|
||||
local mode="$1" # "ok" | "down" | "unauth"
|
||||
# Pick a free port via socket binding; pass it explicitly to the server.
|
||||
local port
|
||||
port=$(python3 -c "
|
||||
@@ -54,11 +54,11 @@ class H(http.server.BaseHTTPRequestHandler):
|
||||
if mode == "down":
|
||||
self.send_error(503, "simulated outage")
|
||||
return
|
||||
if mode == "empty_200":
|
||||
self.send_response(200)
|
||||
if mode == "unauth":
|
||||
self.send_response(401)
|
||||
self.send_header("Content-Type", "application/json")
|
||||
self.end_headers()
|
||||
self.wfile.write(b'{"error":"upstream silent"}')
|
||||
self.wfile.write(b'{"error":"unauthorized"}')
|
||||
return
|
||||
# ok
|
||||
body = {"choices":[{"message":{"role":"assistant","content":"pong"}}]}
|
||||
@@ -140,10 +140,10 @@ test_proxy_unreachable() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# Test 3: proxy returns 200 with malformed body → exit 70.
|
||||
test_200_empty_body() {
|
||||
# Test 3: proxy returns 401 (auth required) → exit 0 (#76 semantics fix).
|
||||
test_401_reachable() {
|
||||
PY_SERVER_PORT=0
|
||||
start_test_server "empty_200"
|
||||
start_test_server "unauth"
|
||||
# E2E_LLM_PROXY_URL is read by the sourced llm_proxy_preflight helper
|
||||
# (lib/llm_proxy_preflight.sh) via ${E2E_LLM_PROXY_URL:-}. Export it
|
||||
# here so shellcheck doesn't false-positive SC2034 (appears unused) when
|
||||
@@ -152,19 +152,14 @@ test_200_empty_body() {
|
||||
local out rc
|
||||
out=$(llm_proxy_preflight 2>&1)
|
||||
rc=$?
|
||||
if [ "$rc" -ne 70 ]; then
|
||||
echo "FAIL: test_200_empty_body expected exit 70, got $rc"
|
||||
echo " output: $out"
|
||||
return 1
|
||||
fi
|
||||
if ! echo "$out" | grep -q "DEP-DOWN:staging-llm"; then
|
||||
echo "FAIL: test_200_empty_body output missing DEP-DOWN:staging-llm prefix"
|
||||
if [ "$rc" -ne 0 ]; then
|
||||
echo "FAIL: test_401_reachable expected exit 0, got $rc"
|
||||
echo " output: $out"
|
||||
return 1
|
||||
fi
|
||||
stop_test_server
|
||||
PY_SERVER_PID=
|
||||
echo "PASS: test_200_empty_body"
|
||||
echo "PASS: test_401_reachable"
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -217,7 +212,7 @@ test_503() {
|
||||
failed=0
|
||||
test_config_missing || failed=$((failed+1))
|
||||
test_proxy_unreachable || failed=$((failed+1))
|
||||
test_200_empty_body || failed=$((failed+1))
|
||||
test_401_reachable || failed=$((failed+1))
|
||||
test_ok || failed=$((failed+1))
|
||||
test_503 || failed=$((failed+1))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user