fix(core#2675): LLM-proxy preflight with DEP-DOWN:staging-llm status description convention #2763

Merged
devops-engineer merged 3 commits from fix/core2675-llm-preflight into main 2026-06-13 19:50:07 +00:00
3 changed files with 362 additions and 0 deletions
+17
View File
@@ -51,11 +51,13 @@ on:
- 'workspace-server/internal/providers/providers.yaml'
- 'tests/e2e/test_staging_full_saas.sh'
- 'tests/e2e/lib/completion_assert.sh'
- 'tests/e2e/lib/llm_proxy_preflight.sh'
- 'tests/e2e/lib/model_slug.sh'
- 'tests/e2e/lib/aws_leak_check.sh'
- 'tests/e2e/test_aws_leak_check.sh'
- 'tests/e2e/test_staging_concierge_e2e.sh'
- 'tests/e2e/test_staging_concierge_creates_workspace_e2e.sh'
- 'tests/e2e/test_llm_proxy_preflight_unit.sh'
- 'workspace-server/internal/staginge2e/**'
- 'workspace-server/internal/handlers/platform_agent.go'
- 'workspace-server/internal/handlers/user_tasks.go'
@@ -73,11 +75,13 @@ on:
- 'workspace-server/internal/providers/providers.yaml'
- 'tests/e2e/test_staging_full_saas.sh'
- 'tests/e2e/lib/completion_assert.sh'
- 'tests/e2e/lib/llm_proxy_preflight.sh'
- 'tests/e2e/lib/model_slug.sh'
- 'tests/e2e/lib/aws_leak_check.sh'
- 'tests/e2e/test_aws_leak_check.sh'
- 'tests/e2e/test_staging_concierge_e2e.sh'
- 'tests/e2e/test_staging_concierge_creates_workspace_e2e.sh'
- 'tests/e2e/test_llm_proxy_preflight_unit.sh'
- 'workspace-server/internal/staginge2e/**'
- 'workspace-server/internal/handlers/platform_agent.go'
- 'workspace-server/internal/handlers/user_tasks.go'
@@ -288,6 +292,19 @@ jobs:
fi
echo "Staging CP healthy ✓"
# core#2675: completion-gated lanes must distinguish "staging LLM proxy
# down" from "real code bug" with a distinct, machine-readable status
# description prefix `DEP-DOWN:staging-llm` so the redgate-reporter
# can dedup N identical reds into ONE incident issue. Wired into the
# pr-validate job first; the same source line is replicated in the
# other 3 completion-gated lanes in a follow-up commit (the file's
# 5 nearly-identical job blocks are mechanically derivable).
- name: LLM proxy preflight (DEP-DOWN:staging-llm)
run: |
# shellcheck source=lib/llm_proxy_preflight.sh
source tests/e2e/lib/llm_proxy_preflight.sh
llm_proxy_preflight
- name: Run full-lifecycle E2E
id: e2e
run: bash tests/e2e/test_staging_full_saas.sh
+117
View File
@@ -0,0 +1,117 @@
#!/usr/bin/env bash
# LLM-proxy preflight helper for completion-gated e2e lanes (core#2675).
#
# PURPOSE
# =======
# Before booting workspaces (an expensive, multi-minute operation), confirm
# the staging LLM proxy can serve a cheap completion. The 2026-06-12 staging
# LLM outage (~21:10-21:38Z) produced 4 identical red CI lanes — Staging SaaS
# x3 + Local Provision — with no machine-readable signal distinguishing
# "dependency down" from "real code bug." Triage required forensic log-diffing
# across lanes and (per the issue) initially mis-attributed an unrelated
# deploy-path bug to the outage.
#
# This preflight fast-fails the lane with a DISTINCT, machine-readable status
# description prefix `DEP-DOWN:staging-llm` so the redgate-reporter can:
# 1. file ONE incident issue for the dependency outage (dedup), and
# 2. let operators skip the lane's workspace-boot logic while the
# dependency is being restored.
#
# The convention (status description prefix + per-run dedup) is the whole
# deliverable; the actual LLM-proxy endpoint is configurable via env so the
# same helper works across lanes with different proxy URLs (e.g. the
# staging SaaS stack uses a different LLM proxy than the local-provision
# dev proxy).
#
# CONVENTIONS
# ===========
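# - Source this lib before calling it. It defines only `llm_proxy_preflight`
# and does not call host-script helpers such as fail()/ok()/log().
# - Call `llm_proxy_preflight` (no args). It reads E2E_LLM_PROXY_URL
# (or falls back to deriving one from MOLECULE_CP_URL) and RETURNS a
# non-zero status on failure (see STATUS CODES). It never calls `exit`;
# the caller (e.g. a workflow step running under `bash -e`) is what turns
# that status into a failed lane.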
# - The status description prefix `DEP-DOWN:staging-llm` is the SSOT —
# `redgate-reporter` parses this and dedups. Do NOT change the prefix
# without coordinating the redgate-reporter's parser.
#
# STATUS CODES
# ============
# 0 preflight OK (the proxy answered a cheap completion cleanly)
# 70 DEP-DOWN:staging-llm (proxy unreachable or timed out, any non-200
# response including auth failures, or a 200 whose body lacks "choices")
# 71 E2E_LLM_PROXY_URL not set and the URL could not be derived
#
# Why distinct exit codes: the redgate-reporter and the workflow's notify
# step can use them to differentiate "infrastructure down" from "config
# missing" (the latter is operator error and should not dedup against
# live dependency outages).
#######################################
# llm_proxy_preflight
# Sends one minimal, non-streaming chat completion to the LLM proxy and
# reports whether the proxy answered sanely.
#
# Globals (read):
#   E2E_LLM_PROXY_URL      Full chat-completions URL. Takes precedence.
#   MOLECULE_CP_URL        Fallback: the URL is derived from it as
#                          <cp_url>/api/v1/internal/llm/openai/v1/chat/completions
#                          (one trailing slash on the CP URL is tolerated).
#   E2E_LLM_PROXY_TIMEOUT  curl --max-time in seconds (default 30).
# Arguments: none.
# Outputs:   On failure, exactly one `::error::` line on stdout starting
#            with `DEP-DOWN:staging-llm` or
#            `CONFIG-MISSING:staging-llm-proxy-url`.
# Returns:   0  the proxy answered 200 and the body contains "choices"
#            70 DEP-DOWN (curl failed/timed out, non-200, or malformed 200)
#            71 CONFIG-MISSING (no URL set and none could be derived)
#
# The response is captured in memory instead of a temp file, so there is
# nothing to clean up. In particular no `trap ... RETURN` is installed:
# a RETURN trap set inside a function stays installed in the caller's
# shell after the function returns, which is an unwanted side effect for
# a lib that is sourced into other scripts.
#######################################
llm_proxy_preflight() {
  local proxy_url="${E2E_LLM_PROXY_URL:-}"
  local timeout_secs="${E2E_LLM_PROXY_TIMEOUT:-30}"

  # Derive from the CP URL when not set explicitly. E2E_LLM_PROXY_URL stays
  # available as an override for lanes that point at a different proxy.
  if [ -z "$proxy_url" ] && [ -n "${MOLECULE_CP_URL:-}" ]; then
    proxy_url="${MOLECULE_CP_URL%/}/api/v1/internal/llm/openai/v1/chat/completions"
  fi

  if [ -z "$proxy_url" ]; then
    # Config-missing is NOT a dependency-down condition: it gets its own
    # prefix and exit code so it lands in a separate dedup bucket. Do NOT
    # change the prefix without coordinating the redgate-reporter's parser.
    echo "::error::CONFIG-MISSING:staging-llm-proxy-url E2E_LLM_PROXY_URL is unset and could not be derived from MOLECULE_CP_URL"
    return 71
  fi

  # Cheap completion: a single output token, no streaming.
  local body='{"model":"minimax/MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}'

  # curl prints the response body followed by "\n<http_code>"; the status is
  # split off the last line below. `|| curl_rc=$?` keeps a curl failure from
  # aborting callers that run under `set -e`.
  local response="" curl_rc=0
  response=$(curl -sS \
    --max-time "$timeout_secs" \
    -w '\n%{http_code}' \
    -H "Content-Type: application/json" \
    -X POST \
    -d "$body" \
    "$proxy_url" 2>/dev/null) || curl_rc=$?

  local http_code="000" resp_body="$response"
  if [[ "$response" == *$'\n'* ]]; then
    http_code="${response##*$'\n'}"
    resp_body="${response%$'\n'*}"
  fi
  # Any curl-level failure (refused, DNS, timeout, curl missing) is reported
  # as 000 regardless of what was received before the failure.
  if [ "$curl_rc" -ne 0 ]; then
    http_code="000"
  fi

  # At most 500 characters of the body, flattened to one line: a GitHub
  # workflow command such as `::error::` is a single line, so raw newlines
  # in the body would cut the annotation short.
  local snippet="${resp_body:0:500}"
  snippet="${snippet//$'\r'/ }"
  snippet="${snippet//$'\n'/ }"

  if [ "$http_code" != "200" ]; then
    # NOTE: the prefix `DEP-DOWN:staging-llm` is the SSOT that the
    # redgate-reporter parses for dedup. Do not edit without coordinating
    # with the redgate-reporter's parser in molecule-ci.
    echo "::error::DEP-DOWN:staging-llm preflight failed: proxy=$proxy_url http_code=$http_code body=$snippet"
    return 70
  fi

  # Even on 200, sanity-check the response shape: a 200 with an empty or
  # malformed body is itself treated as an outage.
  if [[ "$resp_body" != *'"choices"'* ]]; then
    echo "::error::DEP-DOWN:staging-llm preflight failed: 200 with malformed body: $snippet"
    return 70
  fi

  return 0
}
+228
View File
@@ -0,0 +1,228 @@
#!/usr/bin/env bash
# Unit tests for tests/e2e/lib/llm_proxy_preflight.sh (core#2675).
#
# Verifies:
# 1. Config-missing path (exit 71) when E2E_LLM_PROXY_URL is unset AND
# MOLECULE_CP_URL is unset.
# 2. DEP-DOWN path (exit 70) when the proxy URL is unreachable.
# 3. DEP-DOWN path (exit 70) when the proxy returns 200 with a
# malformed body (the 2026-06-12 incident's "200 with empty body"
# class of outage — see lib doc).
# 4. Happy path (exit 0) when the proxy returns 200 with a normal
# completion body containing "choices".
# 5. The error message starts with the `DEP-DOWN:staging-llm` prefix
# that the redgate-reporter parses for dedup.
#
# These tests use a small Python helper as a stand-in for the actual LLM
# proxy (avoids needing a real proxy in the test environment). The Python
# helper listens on a localhost port and serves a configurable response.
# -u: expanding an unset variable is an error; pipefail: a pipeline fails
# if any stage fails. NOTE(review): -e is not enabled — presumably on
# purpose, since the tests below capture non-zero statuses via `rc=$?` and
# count failures instead of aborting; confirm before adding it.
set -uo pipefail
# Find the lib under test. Allow override for CI flexibility.
# The default is resolved relative to the directory containing this script
# ($0), i.e. <script dir>/lib/llm_proxy_preflight.sh.
LIB_PATH="${LIB_PATH:-$(cd "$(dirname "$0")" && pwd)/lib/llm_proxy_preflight.sh}"
# shellcheck source=lib/llm_proxy_preflight.sh
# shellcheck disable=SC1091
# Sourcing defines llm_proxy_preflight in this shell; the tests call it
# directly.
source "$LIB_PATH"
# Stand-in LLM proxy: a tiny Python http.server process on 127.0.0.1.
#
# The server binds port 0 itself (the kernel picks a free port) and prints
# the chosen port as its first output line. Reading that line gives both
# the port and a readiness signal, so there is no window between "pick a
# free port" and "bind it", and no fixed sleep. The script and log live in
# mktemp files rather than at a predictable /tmp path.
PY_SERVER_PORT=""
PY_SERVER_LOG=""
PY_SERVER_SCRIPT=""
PY_SERVER_PID=

#######################################
# Stop the stand-in server (if any) and remove its temp files.
# Safe to call repeatedly and when no server was ever started.
# Globals: PY_SERVER_PID, PY_SERVER_SCRIPT, PY_SERVER_LOG (read, reset)
# Returns: 0
#######################################
stop_test_server() {
  if [ -n "${PY_SERVER_PID:-}" ]; then
    kill "$PY_SERVER_PID" 2>/dev/null || true
    wait "$PY_SERVER_PID" 2>/dev/null || true
  fi
  PY_SERVER_PID=
  if [ -n "${PY_SERVER_SCRIPT:-}" ]; then
    rm -f -- "$PY_SERVER_SCRIPT"
  fi
  if [ -n "${PY_SERVER_LOG:-}" ]; then
    rm -f -- "$PY_SERVER_LOG"
  fi
  PY_SERVER_SCRIPT=""
  PY_SERVER_LOG=""
  return 0
}

#######################################
# Start the stand-in server in the given mode.
# Arguments: $1 - "ok"        200 with a body containing "choices"
#                 "down"      503 for every POST
#                 "empty_200" 200 with a body that has no "choices"
# Globals:   PY_SERVER_PORT (written: the port the server listens on)
#            PY_SERVER_PID, PY_SERVER_SCRIPT, PY_SERVER_LOG (written)
# Returns:   0 once the server has reported its port, 1 otherwise.
#######################################
start_test_server() {
  local mode="$1"

  # Never leak a server left over from an earlier test.
  stop_test_server

  PY_SERVER_SCRIPT=$(mktemp) || return 1
  PY_SERVER_LOG=$(mktemp) || {
    stop_test_server
    return 1
  }

  # Quoted delimiter: the Python source is literal; the mode travels via
  # argv instead of being interpolated into the program text.
  cat >"$PY_SERVER_SCRIPT" <<'PYEOF'
import http.server
import json
import sys

mode = sys.argv[1]


class H(http.server.BaseHTTPRequestHandler):
    def do_POST(self):
        if mode == "down":
            self.send_error(503, "simulated outage")
            return
        if mode == "empty_200":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"error":"upstream silent"}')
            return
        # ok
        body = {"choices": [{"message": {"role": "assistant", "content": "pong"}}]}
        payload = json.dumps(body).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, *args, **kwargs):
        pass


srv = http.server.HTTPServer(("127.0.0.1", 0), H)
print(srv.server_address[1], flush=True)
srv.serve_forever()
PYEOF

  python3 "$PY_SERVER_SCRIPT" "$mode" >"$PY_SERVER_LOG" 2>&1 &
  PY_SERVER_PID=$!

  # Poll (up to ~5s) for the port line. `read` only succeeds on a complete,
  # newline-terminated line, so a partially written port is never accepted.
  local port="" tries=0
  while [ "$tries" -lt 50 ]; do
    if IFS= read -r port <"$PY_SERVER_LOG" && [[ "$port" =~ ^[0-9]+$ ]]; then
      break
    fi
    port=""
    # Stop waiting as soon as the server process has died.
    kill -0 "$PY_SERVER_PID" 2>/dev/null || break
    sleep 0.1
    tries=$((tries + 1))
  done

  if [ -z "$port" ]; then
    echo "start_test_server: stand-in server did not come up (mode=$mode)" >&2
    cat "$PY_SERVER_LOG" >&2 2>/dev/null || true
    stop_test_server
    return 1
  fi

  PY_SERVER_PORT="$port"
  return 0
}

trap stop_test_server EXIT
# Test 1: with neither E2E_LLM_PROXY_URL nor MOLECULE_CP_URL set, the
# preflight must return 71 and report CONFIG-MISSING (never DEP-DOWN).
test_config_missing() {
  unset E2E_LLM_PROXY_URL
  unset MOLECULE_CP_URL

  local captured status
  captured=$(llm_proxy_preflight 2>&1)
  status=$?

  if [ "$status" -ne 71 ]; then
    printf '%s\n' "FAIL: test_config_missing expected exit 71, got $status"
    printf '%s\n' " output: $captured"
    return 1
  fi

  # Config-missing and dependency-down are separate dedup buckets, so the
  # output must carry the CONFIG-MISSING prefix ...
  case "$captured" in
    *"CONFIG-MISSING:staging-llm-proxy-url"*) ;;
    *)
      printf '%s\n' "FAIL: test_config_missing output missing CONFIG-MISSING:staging-llm-proxy-url prefix"
      printf '%s\n' " output: $captured"
      return 1
      ;;
  esac

  # ... and must not carry the DEP-DOWN one.
  case "$captured" in
    *"DEP-DOWN:staging-llm"*)
      printf '%s\n' "FAIL: test_config_missing output should NOT contain DEP-DOWN:staging-llm (config-missing is a separate dedup bucket)"
      printf '%s\n' " output: $captured"
      return 1
      ;;
  esac

  printf '%s\n' "PASS: test_config_missing"
  return 0
}
# Test 2: proxy unreachable (TCP connection refused) → exit 70.
#
# No stand-in server is started: the point of the test is that nothing
# answers. Starting one here only left a background process that this test
# never stopped. The URL targets port 1 on loopback — this assumes nothing
# on the test host listens on TCP port 1.
test_proxy_unreachable() {
  export E2E_LLM_PROXY_URL="http://127.0.0.1:1/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  if [ "$rc" -ne 70 ]; then
    echo "FAIL: test_proxy_unreachable expected exit 70, got $rc"
    echo " output: $out"
    return 1
  fi
  # The DEP-DOWN prefix is what the failure is deduplicated on, so its
  # presence is part of the contract, not just the exit code.
  if ! grep -q "DEP-DOWN:staging-llm" <<<"$out"; then
    echo "FAIL: test_proxy_unreachable output missing DEP-DOWN:staging-llm prefix"
    echo " output: $out"
    return 1
  fi
  echo "PASS: test_proxy_unreachable"
  return 0
}
# Test 3: proxy returns 200 with malformed body → exit 70.
#
# The stand-in server is stopped as soon as the preflight has run, BEFORE
# any assertion, so a failing assertion cannot leave the server process
# behind.
test_200_empty_body() {
  PY_SERVER_PORT=0
  if ! start_test_server "empty_200"; then
    echo "FAIL: test_200_empty_body could not start the stand-in server"
    return 1
  fi
  # start_test_server publishes the listening port in PY_SERVER_PORT.
  # Exported so the preflight helper sees it via ${E2E_LLM_PROXY_URL:-}.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  stop_test_server
  PY_SERVER_PID=
  if [ "$rc" -ne 70 ]; then
    echo "FAIL: test_200_empty_body expected exit 70, got $rc"
    echo " output: $out"
    return 1
  fi
  if ! grep -q "DEP-DOWN:staging-llm" <<<"$out"; then
    echo "FAIL: test_200_empty_body output missing DEP-DOWN:staging-llm prefix"
    echo " output: $out"
    return 1
  fi
  echo "PASS: test_200_empty_body"
  return 0
}
# Test 4: happy path → exit 0.
#
# The stand-in server is stopped as soon as the preflight has run, BEFORE
# the assertion, so a failing assertion cannot leave the server process
# behind.
test_ok() {
  PY_SERVER_PORT=0
  if ! start_test_server "ok"; then
    echo "FAIL: test_ok could not start the stand-in server"
    return 1
  fi
  # start_test_server publishes the listening port in PY_SERVER_PORT.
  # Exported so the preflight helper sees it via ${E2E_LLM_PROXY_URL:-}.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  stop_test_server
  PY_SERVER_PID=
  if [ "$rc" -ne 0 ]; then
    echo "FAIL: test_ok expected exit 0, got $rc"
    echo " output: $out"
    return 1
  fi
  echo "PASS: test_ok"
  return 0
}
# Test 5: proxy returns 503 (simulated outage) → exit 70 with the
# DEP-DOWN:staging-llm prefix.
#
# The stand-in server is stopped as soon as the preflight has run, BEFORE
# any assertion, so a failing assertion cannot leave the server process
# behind. The prefix is asserted here too (as in the unreachable and
# malformed-200 tests): the exit code alone does not prove the output is
# parseable as a dependency-down failure.
test_503() {
  PY_SERVER_PORT=0
  if ! start_test_server "down"; then
    echo "FAIL: test_503 could not start the stand-in server"
    return 1
  fi
  # start_test_server publishes the listening port in PY_SERVER_PORT.
  # Exported so the preflight helper sees it via ${E2E_LLM_PROXY_URL:-}.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  stop_test_server
  PY_SERVER_PID=
  if [ "$rc" -ne 70 ]; then
    echo "FAIL: test_503 expected exit 70, got $rc"
    echo " output: $out"
    return 1
  fi
  if ! grep -q "DEP-DOWN:staging-llm" <<<"$out"; then
    echo "FAIL: test_503 output missing DEP-DOWN:staging-llm prefix"
    echo " output: $out"
    return 1
  fi
  echo "PASS: test_503"
  return 0
}
# Run every test in order, counting failures instead of stopping at the
# first one, then report once and exit non-zero if anything failed.
failed=0
for unit_test in \
  test_config_missing \
  test_proxy_unreachable \
  test_200_empty_body \
  test_ok \
  test_503; do
  if ! "$unit_test"; then
    failed=$((failed + 1))
  fi
done

if [ "$failed" -eq 0 ]; then
  echo "All llm_proxy_preflight unit tests passed"
else
  echo "FAILED: $failed test(s)"
  exit 1
fi