fix(core#2675): LLM-proxy preflight with DEP-DOWN:staging-llm status description convention #2763
@@ -51,11 +51,13 @@ on:
|
||||
- 'workspace-server/internal/providers/providers.yaml'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- 'tests/e2e/lib/completion_assert.sh'
|
||||
- 'tests/e2e/lib/llm_proxy_preflight.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/test_aws_leak_check.sh'
|
||||
- 'tests/e2e/test_staging_concierge_e2e.sh'
|
||||
- 'tests/e2e/test_staging_concierge_creates_workspace_e2e.sh'
|
||||
- 'tests/e2e/test_llm_proxy_preflight_unit.sh'
|
||||
- 'workspace-server/internal/staginge2e/**'
|
||||
- 'workspace-server/internal/handlers/platform_agent.go'
|
||||
- 'workspace-server/internal/handlers/user_tasks.go'
|
||||
@@ -73,11 +75,13 @@ on:
|
||||
- 'workspace-server/internal/providers/providers.yaml'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- 'tests/e2e/lib/completion_assert.sh'
|
||||
- 'tests/e2e/lib/llm_proxy_preflight.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/test_aws_leak_check.sh'
|
||||
- 'tests/e2e/test_staging_concierge_e2e.sh'
|
||||
- 'tests/e2e/test_staging_concierge_creates_workspace_e2e.sh'
|
||||
- 'tests/e2e/test_llm_proxy_preflight_unit.sh'
|
||||
- 'workspace-server/internal/staginge2e/**'
|
||||
- 'workspace-server/internal/handlers/platform_agent.go'
|
||||
- 'workspace-server/internal/handlers/user_tasks.go'
|
||||
@@ -288,6 +292,19 @@ jobs:
|
||||
fi
|
||||
echo "Staging CP healthy ✓"
|
||||
|
||||
# core#2675: completion-gated lanes must distinguish "staging LLM proxy
|
||||
# down" from "real code bug" with a distinct, machine-readable status
|
||||
# description prefix `DEP-DOWN:staging-llm` so the redgate-reporter
|
||||
# can dedup N identical reds into ONE incident issue. Wired into the
|
||||
# pr-validate job first; the same source line is replicated in the
|
||||
# other 3 completion-gated lanes in a follow-up commit (the file's
|
||||
# 5 nearly-identical job blocks are mechanically derivable).
|
||||
- name: LLM proxy preflight (DEP-DOWN:staging-llm)
|
||||
run: |
|
||||
# shellcheck source=lib/llm_proxy_preflight.sh
|
||||
source tests/e2e/lib/llm_proxy_preflight.sh
|
||||
llm_proxy_preflight
|
||||
|
||||
- name: Run full-lifecycle E2E
|
||||
id: e2e
|
||||
run: bash tests/e2e/test_staging_full_saas.sh
|
||||
|
||||
Executable
+117
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env bash
|
||||
# LLM-proxy preflight helper for completion-gated e2e lanes (core#2675).
|
||||
#
|
||||
# PURPOSE
|
||||
# =======
|
||||
# Before booting workspaces (an expensive, multi-minute operation), confirm
|
||||
# the staging LLM proxy can serve a cheap completion. The 2026-06-12 staging
|
||||
# LLM outage (~21:10-21:38Z) produced 4 identical red CI lanes — Staging SaaS
|
||||
# x3 + Local Provision — with no machine-readable signal distinguishing
|
||||
# "dependency down" from "real code bug." Triage required forensic log-diffing
|
||||
# across lanes and (per the issue) initially mis-attributed an unrelated
|
||||
# deploy-path bug to the outage.
|
||||
#
|
||||
# This preflight fast-fails the lane with a DISTINCT, machine-readable status
|
||||
# description prefix `DEP-DOWN:staging-llm` so the redgate-reporter can:
|
||||
# 1. file ONE incident issue for the dependency outage (dedup), and
|
||||
# 2. let operators skip the lane's workspace-boot logic while the
|
||||
# dependency is being restored.
|
||||
#
|
||||
# The convention (status description prefix + per-run dedup) is the whole
|
||||
# deliverable; the actual LLM-proxy endpoint is configurable via env so the
|
||||
# same helper works across lanes with different proxy URLs (e.g. the
|
||||
# staging SaaS stack uses a different LLM proxy than the local-provision
|
||||
# dev proxy).
|
||||
#
|
||||
# CONVENTIONS
|
||||
# ===========
|
||||
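# - Source this lib before calling `llm_proxy_preflight`. The function does not call the host script's fail()/ok()/log() helpers, so it can be sourced from a bare workflow step.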
|
||||
# - Call `llm_proxy_preflight` (no args). It reads E2E_LLM_PROXY_URL
|
||||
# (or falls back to deriving one from MOLECULE_CP_URL) and exits the
|
||||
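# whole lane on failure (it signals failure by returning non-zero — see STATUS CODES — so the caller must propagate that status).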
|
||||
# - The status description prefix `DEP-DOWN:staging-llm` is the SSOT —
|
||||
# `redgate-reporter` parses this and dedups. Do NOT change the prefix
|
||||
# without coordinating the redgate-reporter's parser.
|
||||
#
|
||||
# STATUS CODES
|
||||
# ============
|
||||
# 0 preflight OK (the proxy answered a cheap completion cleanly)
|
||||
# 70 DEP-DOWN:staging-llm (proxy unreachable, slow, or auth-failed)
|
||||
# 71 E2E_LLM_PROXY_URL not set and the URL could not be derived
|
||||
#
|
||||
# Why distinct exit codes: the redgate-reporter and the workflow's notify
|
||||
# step can use them to differentiate "infrastructure down" from "config
|
||||
# missing" (the latter is operator error and should not dedup against
|
||||
# live dependency outages).
|
||||
|
||||
# llm_proxy_preflight
#   Probe the LLM proxy with one minimal, non-streaming chat completion and
#   classify the outcome.
#
#   Environment:
#     E2E_LLM_PROXY_URL      full URL of the chat-completions endpoint
#     MOLECULE_CP_URL        CP base URL; used to derive the endpoint when
#                            E2E_LLM_PROXY_URL is unset or empty
#     E2E_LLM_PROXY_TIMEOUT  curl --max-time, in seconds (default 30)
#
#   Outputs: on failure, exactly one `::error::` line on stdout.
#   Returns:
#     0   the proxy answered 200 and the body contains "choices"
#     70  DEP-DOWN:staging-llm (non-200, transport failure, malformed 200)
#     71  CONFIG-MISSING (no URL set and none derivable)
#     1   the scratch file could not be created (a runner problem, so it is
#         deliberately NOT reported under the DEP-DOWN prefix)
llm_proxy_preflight() {
  local proxy_url="${E2E_LLM_PROXY_URL:-}"
  local timeout_secs="${E2E_LLM_PROXY_TIMEOUT:-30}"

  # Derive from the CP URL when not set. The platform-managed LLM proxy
  # is exposed at <cp_url>/api/v1/internal/llm/openai/v1. The
  # E2E_LLM_PROXY_URL override stays available for lanes that point at a
  # different proxy.
  if [ -z "$proxy_url" ] && [ -n "${MOLECULE_CP_URL:-}" ]; then
    proxy_url="${MOLECULE_CP_URL%/}/api/v1/internal/llm/openai/v1/chat/completions"
  fi

  if [ -z "$proxy_url" ]; then
    # Config-missing is NOT a dependency-down condition — it is operator
    # error. It gets its own CONFIG-MISSING prefix so it is never counted
    # as a live dependency outage. Do NOT change the prefix without
    # coordinating the redgate-reporter's parser.
    echo "::error::CONFIG-MISSING:staging-llm-proxy-url E2E_LLM_PROXY_URL is unset and could not be derived from MOLECULE_CP_URL"
    return 71
  fi

  # Cheap completion: one output token, no streaming.
  local body='{"model":"minimax/MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}'

  local tmpfile
  if ! tmpfile=$(mktemp); then
    echo "::error::llm_proxy_preflight: mktemp failed; cannot capture the proxy response"
    return 1
  fi

  # Any curl failure (refused, timeout, DNS, curl missing) collapses to the
  # sentinel code 000 so it takes the same DEP-DOWN path as a non-200.
  local http_code
  http_code=$(curl -sS -o "$tmpfile" -w "%{http_code}" \
    --max-time "$timeout_secs" \
    -H "Content-Type: application/json" \
    -X POST \
    -d "$body" \
    "$proxy_url" 2>/dev/null) || http_code="000"

  # Pull everything we need out of the scratch file, then delete it right
  # away. This replaces a `trap ... RETURN`, which would have stayed
  # installed in the sourcing shell after this function returned.
  local snippet has_choices=0
  snippet=$(head -c 500 "$tmpfile" 2>/dev/null) || snippet=""
  if grep -q '"choices"' "$tmpfile" 2>/dev/null; then
    has_choices=1
  fi
  rm -f -- "$tmpfile"

  # The body is untrusted upstream data echoed into a workflow command.
  # Flatten line breaks so the annotation stays on one line and a body line
  # beginning with `::` cannot be read as a separate command.
  snippet=${snippet//$'\r'/ }
  snippet=${snippet//$'\n'/ }

  if [ "$http_code" != "200" ]; then
    # NOTE: the prefix `DEP-DOWN:staging-llm` is the SSOT that the
    # redgate-reporter parses for dedup. Do not edit without coordinating
    # with the redgate-reporter's parser in molecule-ci.
    echo "::error::DEP-DOWN:staging-llm preflight failed: proxy=$proxy_url http_code=$http_code body=$snippet"
    return 70
  fi

  # Even on 200, sanity-check the response shape — a 200 with an empty or
  # malformed body is treated as an outage too.
  if [ "$has_choices" -ne 1 ]; then
    echo "::error::DEP-DOWN:staging-llm preflight failed: 200 with malformed body: $snippet"
    return 70
  fi

  return 0
}
|
||||
Executable
+228
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env bash
|
||||
# Unit tests for tests/e2e/lib/llm_proxy_preflight.sh (core#2675).
|
||||
#
|
||||
# Verifies:
|
||||
# 1. Config-missing path (exit 71) when E2E_LLM_PROXY_URL is unset AND
|
||||
# MOLECULE_CP_URL is unset.
|
||||
# 2. DEP-DOWN path (exit 70) when the proxy URL is unreachable.
|
||||
# 3. DEP-DOWN path (exit 70) when the proxy returns 200 with a
|
||||
# malformed body (the 2026-06-12 incident's "200 with empty body"
|
||||
# class of outage — see lib doc).
|
||||
# 4. Happy path (exit 0) when the proxy returns 200 with a normal
|
||||
# completion body containing "choices".
|
||||
# 5. The error message carries the `DEP-DOWN:staging-llm` prefix (right after the `::error::` marker)
|
||||
# that the redgate-reporter parses for dedup.
|
||||
#
|
||||
# These tests use a small Python helper as a stand-in for the actual LLM
|
||||
# proxy (avoids needing a real proxy in the test environment). The Python
|
||||
# helper listens on a localhost port and serves a configurable response.
|
||||
|
||||
# No `-e`: the tests below deliberately capture non-zero statuses from the
# function under test (`out=$(...); rc=$?`).
set -uo pipefail

# Find the lib under test. Allow override for CI flexibility.
LIB_PATH="${LIB_PATH:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/lib/llm_proxy_preflight.sh}"

# Fail loudly here rather than letting every test report a misleading
# "expected exit N, got 127" because the function was never defined.
if [ ! -r "$LIB_PATH" ]; then
  echo "FATAL: lib under test not found or unreadable: $LIB_PATH" >&2
  exit 1
fi

# shellcheck source=lib/llm_proxy_preflight.sh
# shellcheck disable=SC1091
source "$LIB_PATH"

if ! declare -F llm_proxy_preflight >/dev/null; then
  echo "FATAL: $LIB_PATH did not define llm_proxy_preflight" >&2
  exit 1
fi
|
||||
|
||||
# Stand-in LLM proxy: a tiny Python http.server process (standard library
# only). The server binds port 0 itself and reports the port it was given
# through a file, so there is no window between picking a port and binding
# it, and the port file doubles as the readiness signal.
#
# Globals:
#   PY_SERVER_PORT  port of the running server ("" when none)
#   PY_SERVER_PID   pid of the running server ("" when none)
#   PY_SERVER_LOG   file capturing the server's stdout/stderr
#   PY_SERVER_DIR   private scratch dir holding the script and port file
PY_SERVER_PORT=""
PY_SERVER_LOG=$(mktemp)
PY_SERVER_PID=
PY_SERVER_DIR=

# start_test_server MODE
#   MODE is "ok" (200 + completion body), "down" (503) or "empty_200"
#   (200 whose body has no "choices" key).
#   Sets PY_SERVER_PID and PY_SERVER_PORT. Returns 0 once the server is
#   listening, 1 if it could not be started.
start_test_server() {
  local mode="$1" # "ok" | "down" | "empty_200"

  # Never orphan a server left over from an earlier test.
  if [ -n "$PY_SERVER_PID" ]; then
    stop_test_server
  fi
  PY_SERVER_PORT=""

  if ! PY_SERVER_DIR=$(mktemp -d); then
    PY_SERVER_DIR=
    echo "start_test_server: mktemp -d failed" >&2
    return 1
  fi
  local script="$PY_SERVER_DIR/server.py"
  local port_file="$PY_SERVER_DIR/port"

  # Quoted delimiter: nothing is interpolated into the Python source; the
  # mode and the port-file path are passed as arguments instead.
  cat >"$script" <<'PYEOF'
import http.server
import json
import os
import sys

mode, port_file = sys.argv[1], sys.argv[2]


class H(http.server.BaseHTTPRequestHandler):
    def do_POST(self):
        # Consume the request body so the connection is not closed with
        # unread request data still pending.
        length = int(self.headers.get("Content-Length") or 0)
        if length:
            self.rfile.read(length)
        if mode == "down":
            self.send_error(503, "simulated outage")
            return
        if mode == "empty_200":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"error":"upstream silent"}')
            return
        # ok
        body = {"choices": [{"message": {"role": "assistant", "content": "pong"}}]}
        payload = json.dumps(body).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, *args, **kwargs):
        pass


server = http.server.HTTPServer(("127.0.0.1", 0), H)
# Publish the port atomically so the shell never reads a partial file.
tmp_path = port_file + ".tmp"
with open(tmp_path, "w") as fh:
    fh.write(str(server.server_address[1]))
os.replace(tmp_path, port_file)
server.serve_forever()
PYEOF

  python3 "$script" "$mode" "$port_file" >"$PY_SERVER_LOG" 2>&1 &
  PY_SERVER_PID=$!

  # Wait (up to ~5s) for the port file instead of sleeping a fixed amount.
  # A server that is not up yet would be indistinguishable from an outage to
  # the function under test.
  local attempt
  for ((attempt = 0; attempt < 100; attempt++)); do
    if [ -s "$port_file" ]; then
      PY_SERVER_PORT=$(cat "$port_file")
      return 0
    fi
    if ! kill -0 "$PY_SERVER_PID" 2>/dev/null; then
      break
    fi
    sleep 0.05
  done

  echo "start_test_server: server did not come up (mode=$mode)" >&2
  cat "$PY_SERVER_LOG" >&2 2>/dev/null || true
  stop_test_server
  return 1
}

# stop_test_server
#   Kill and reap the running server (if any), clear PY_SERVER_PID, and
#   remove the scratch dir and log file. Safe to call repeatedly.
stop_test_server() {
  if [ -n "$PY_SERVER_PID" ]; then
    kill "$PY_SERVER_PID" 2>/dev/null || true
    wait "$PY_SERVER_PID" 2>/dev/null || true
  fi
  # Clear the pid so a later call (e.g. the EXIT trap) cannot signal an
  # unrelated process that has since reused it.
  PY_SERVER_PID=
  if [ -n "$PY_SERVER_DIR" ]; then
    rm -rf -- "$PY_SERVER_DIR"
  fi
  PY_SERVER_DIR=
  rm -f -- "$PY_SERVER_LOG"
}
trap stop_test_server EXIT
|
||||
|
||||
# Test 1: config-missing path → exit 71 with the CONFIG-MISSING prefix.
#   Clears both URL variables, runs the preflight, and checks the status and
#   the prefix. Prints PASS/FAIL; returns 0 on pass, 1 on fail.
test_config_missing() {
  unset E2E_LLM_PROXY_URL
  unset MOLECULE_CP_URL
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  if [ "$rc" -ne 71 ]; then
    echo "FAIL: test_config_missing expected exit 71, got $rc"
    echo "  output: $out"
    return 1
  fi
  # Config-missing emits CONFIG-MISSING, NOT DEP-DOWN — the two are distinct
  # dedup buckets. Substring checks use [[ ]] rather than `echo | grep -q`,
  # which under `set -o pipefail` can fail spuriously when grep exits early.
  if [[ "$out" != *"CONFIG-MISSING:staging-llm-proxy-url"* ]]; then
    echo "FAIL: test_config_missing output missing CONFIG-MISSING:staging-llm-proxy-url prefix"
    echo "  output: $out"
    return 1
  fi
  if [[ "$out" == *"DEP-DOWN:staging-llm"* ]]; then
    echo "FAIL: test_config_missing output should NOT contain DEP-DOWN:staging-llm (config-missing is a separate dedup bucket)"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: test_config_missing"
  return 0
}
|
||||
|
||||
# Test 2: proxy unreachable (TCP connection refused) → exit 70.
#   No stand-in server is started: the point is to hit a dead port, and a
#   server started here would never be stopped.
#   Prints PASS/FAIL; returns 0 on pass, 1 on fail.
test_proxy_unreachable() {
  # Assumes nothing on the test host listens on 127.0.0.1:1, so the connect
  # is refused immediately.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:1/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  if [ "$rc" -ne 70 ]; then
    echo "FAIL: test_proxy_unreachable expected exit 70, got $rc"
    echo "  output: $out"
    return 1
  fi
  if [[ "$out" != *"DEP-DOWN:staging-llm"* ]]; then
    echo "FAIL: test_proxy_unreachable output missing DEP-DOWN:staging-llm prefix"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: test_proxy_unreachable"
  return 0
}
|
||||
|
||||
# Test 3: proxy returns 200 with malformed body → exit 70.
#   Prints PASS/FAIL; returns 0 on pass, 1 on fail. The stand-in server is
#   stopped on every path, not only on success.
test_200_empty_body() {
  if ! start_test_server "empty_200"; then
    echo "FAIL: test_200_empty_body could not start the stand-in server"
    return 1
  fi
  # Exported so shellcheck does not flag the variable as unused (SC2034)
  # when this file is checked without the sourced lib that reads it.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  # Tear down before asserting so a failed assertion cannot leak the server.
  stop_test_server
  PY_SERVER_PID=
  if [ "$rc" -ne 70 ]; then
    echo "FAIL: test_200_empty_body expected exit 70, got $rc"
    echo "  output: $out"
    return 1
  fi
  if [[ "$out" != *"DEP-DOWN:staging-llm"* ]]; then
    echo "FAIL: test_200_empty_body output missing DEP-DOWN:staging-llm prefix"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: test_200_empty_body"
  return 0
}
|
||||
|
||||
# Test 4: happy path → exit 0.
#   Prints PASS/FAIL; returns 0 on pass, 1 on fail. The stand-in server is
#   stopped on every path, not only on success.
test_ok() {
  if ! start_test_server "ok"; then
    echo "FAIL: test_ok could not start the stand-in server"
    return 1
  fi
  # Exported so shellcheck does not flag the variable as unused (SC2034)
  # when this file is checked without the sourced lib that reads it.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  # Tear down before asserting so a failed assertion cannot leak the server.
  stop_test_server
  PY_SERVER_PID=
  if [ "$rc" -ne 0 ]; then
    echo "FAIL: test_ok expected exit 0, got $rc"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: test_ok"
  return 0
}
|
||||
|
||||
# Test 5: proxy returns 503 (simulated outage) → exit 70 with the
# DEP-DOWN:staging-llm prefix.
#   Prints PASS/FAIL; returns 0 on pass, 1 on fail. The stand-in server is
#   stopped on every path, not only on success.
test_503() {
  if ! start_test_server "down"; then
    echo "FAIL: test_503 could not start the stand-in server"
    return 1
  fi
  # Exported so shellcheck does not flag the variable as unused (SC2034)
  # when this file is checked without the sourced lib that reads it.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  # Tear down before asserting so a failed assertion cannot leak the server.
  stop_test_server
  PY_SERVER_PID=
  if [ "$rc" -ne 70 ]; then
    echo "FAIL: test_503 expected exit 70, got $rc"
    echo "  output: $out"
    return 1
  fi
  # The prefix is what gets parsed downstream, so an HTTP-level outage must
  # carry it just like the other DEP-DOWN cases.
  if [[ "$out" != *"DEP-DOWN:staging-llm"* ]]; then
    echo "FAIL: test_503 output missing DEP-DOWN:staging-llm prefix"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: test_503"
  return 0
}
|
||||
|
||||
# Run every test in order, counting failures; exit 1 if any test failed.
failed=0
for test_fn in \
  test_config_missing \
  test_proxy_unreachable \
  test_200_empty_body \
  test_ok \
  test_503; do
  if ! "$test_fn"; then
    failed=$((failed + 1))
  fi
done

if ((failed > 0)); then
  echo "FAILED: $failed test(s)"
  exit 1
fi
echo "All llm_proxy_preflight unit tests passed"
|
||||
Reference in New Issue
Block a user