diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index baaed9e3f..3af5cc969 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -51,11 +51,13 @@ on: - 'workspace-server/internal/providers/providers.yaml' - 'tests/e2e/test_staging_full_saas.sh' - 'tests/e2e/lib/completion_assert.sh' + - 'tests/e2e/lib/llm_proxy_preflight.sh' - 'tests/e2e/lib/model_slug.sh' - 'tests/e2e/lib/aws_leak_check.sh' - 'tests/e2e/test_aws_leak_check.sh' - 'tests/e2e/test_staging_concierge_e2e.sh' - 'tests/e2e/test_staging_concierge_creates_workspace_e2e.sh' + - 'tests/e2e/test_llm_proxy_preflight_unit.sh' - 'workspace-server/internal/staginge2e/**' - 'workspace-server/internal/handlers/platform_agent.go' - 'workspace-server/internal/handlers/user_tasks.go' @@ -73,11 +75,13 @@ on: - 'workspace-server/internal/providers/providers.yaml' - 'tests/e2e/test_staging_full_saas.sh' - 'tests/e2e/lib/completion_assert.sh' + - 'tests/e2e/lib/llm_proxy_preflight.sh' - 'tests/e2e/lib/model_slug.sh' - 'tests/e2e/lib/aws_leak_check.sh' - 'tests/e2e/test_aws_leak_check.sh' - 'tests/e2e/test_staging_concierge_e2e.sh' - 'tests/e2e/test_staging_concierge_creates_workspace_e2e.sh' + - 'tests/e2e/test_llm_proxy_preflight_unit.sh' - 'workspace-server/internal/staginge2e/**' - 'workspace-server/internal/handlers/platform_agent.go' - 'workspace-server/internal/handlers/user_tasks.go' @@ -288,6 +292,19 @@ jobs: fi echo "Staging CP healthy ✓" + # core#2675: completion-gated lanes must distinguish "staging LLM proxy + # down" from "real code bug" with a distinct, machine-readable status + # description prefix `DEP-DOWN:staging-llm` so the redgate-reporter + # can dedup N identical reds into ONE incident issue. Wired into the + # pr-validate job first; the same source line is replicated in the + # other 3 completion-gated lanes in a follow-up commit (the file's + # 5 nearly-identical job blocks are mechanically derivable). 
+ - name: LLM proxy preflight (DEP-DOWN:staging-llm) + run: | + # shellcheck source=lib/llm_proxy_preflight.sh + source tests/e2e/lib/llm_proxy_preflight.sh + llm_proxy_preflight + - name: Run full-lifecycle E2E id: e2e run: bash tests/e2e/test_staging_full_saas.sh diff --git a/tests/e2e/lib/llm_proxy_preflight.sh b/tests/e2e/lib/llm_proxy_preflight.sh new file mode 100755 index 000000000..144709f55 --- /dev/null +++ b/tests/e2e/lib/llm_proxy_preflight.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# LLM-proxy preflight helper for completion-gated e2e lanes (core#2675). +# +# PURPOSE +# ======= +# Before booting workspaces (an expensive, multi-minute operation), confirm +# the staging LLM proxy can serve a cheap completion. The 2026-06-12 staging +# LLM outage (~21:10-21:38Z) produced 4 identical red CI lanes — Staging SaaS +# x3 + Local Provision — with no machine-readable signal distinguishing +# "dependency down" from "real code bug." Triage required forensic log-diffing +# across lanes and (per the issue) initially mis-attributed an unrelated +# deploy-path bug to the outage. +# +# This preflight fast-fails the lane with a DISTINCT, machine-readable status +# description prefix `DEP-DOWN:staging-llm` so the redgate-reporter can: +# 1. file ONE incident issue for the dependency outage (dedup), and +# 2. let operators skip the lane's workspace-boot logic while the +# dependency is being restored. +# +# The convention (status description prefix + per-run dedup) is the whole +# deliverable; the actual LLM-proxy endpoint is configurable via env so the +# same helper works across lanes with different proxy URLs (e.g. the +# staging SaaS stack uses a different LLM proxy than the local-provision +# dev proxy). +# +# CONVENTIONS +# =========== +# - Source this lib into the calling shell before use. It currently calls +# none of the host script's fail()/ok()/log() helpers, so it can also be +# sourced from a bare workflow `run:` step. +# - Call `llm_proxy_preflight` (no args). 
#   It reads E2E_LLM_PROXY_URL (or falls back to deriving one from
#   MOLECULE_CP_URL) and returns a non-zero status on failure. It does
#   not call `exit`; the caller decides whether that aborts the lane
#   (a workflow step whose last command is this function fails).
# - The status description prefix `DEP-DOWN:staging-llm` is the SSOT —
#   `redgate-reporter` parses this and dedups. Do NOT change the prefix
#   without coordinating the redgate-reporter's parser.
#
# STATUS CODES
# ============
#   0   preflight OK (the proxy answered a cheap completion cleanly)
#   70  DEP-DOWN:staging-llm (proxy unreachable, slow, or auth-failed)
#   71  E2E_LLM_PROXY_URL not set and the URL could not be derived
#
# Why distinct exit codes: the redgate-reporter and the workflow's notify
# step can use them to differentiate "infrastructure down" from "config
# missing" (the latter is operator error and should not dedup against
# live dependency outages).

#######################################
# llm_proxy_preflight
#   POST one cheap, non-streaming chat completion to the LLM proxy and
#   classify the outcome.
# Globals (read):
#   E2E_LLM_PROXY_URL      full chat-completions URL (optional)
#   MOLECULE_CP_URL        CP base URL used to derive the proxy URL when
#                          E2E_LLM_PROXY_URL is empty (optional)
#   E2E_LLM_PROXY_TIMEOUT  curl --max-time in seconds (default 30)
# Arguments: none
# Outputs:
#   On failure, exactly ONE `::error::` line on stdout, starting with
#   `DEP-DOWN:staging-llm` or `CONFIG-MISSING:staging-llm-proxy-url`.
#   Nothing on success.
# Returns:
#   0 on success, 70 on DEP-DOWN, 71 on config-missing.
#######################################
llm_proxy_preflight() {
  local proxy_url="${E2E_LLM_PROXY_URL:-}"
  local timeout_secs="${E2E_LLM_PROXY_TIMEOUT:-30}"
  local nl=$'\n' cr=$'\r'

  # Derive from the CP URL when not set. The platform-managed LLM proxy
  # is exposed at /api/v1/internal/llm/openai/v1; E2E_LLM_PROXY_URL stays
  # available as an override for lanes that point at a different proxy.
  if [ -z "$proxy_url" ] && [ -n "${MOLECULE_CP_URL:-}" ]; then
    proxy_url="${MOLECULE_CP_URL%/}/api/v1/internal/llm/openai/v1/chat/completions"
  fi

  if [ -z "$proxy_url" ]; then
    # Config-missing is NOT a dependency-down condition — it is operator
    # error (a lane was wired without E2E_LLM_PROXY_URL or
    # MOLECULE_CP_URL). It gets its own CONFIG-MISSING prefix so the
    # redgate-reporter dedups it separately from live outages. Do NOT
    # change the prefix without coordinating the reporter's parser.
    echo "::error::CONFIG-MISSING:staging-llm-proxy-url E2E_LLM_PROXY_URL is unset and could not be derived from MOLECULE_CP_URL"
    return 71
  fi

  # Cheap completion: minimal token count, no streaming. Any model id the
  # proxy accepts is fine for a liveness check.
  local payload='{"model":"minimax/MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}'

  # Capture body and status in one stream: curl prints the body, then
  # -w appends "\n<http_code>". This needs no temp file, so there is no
  # cleanup trap to leak and no mktemp failure to misreport as DEP-DOWN.
  # curl's stderr stays suppressed to keep the output to a single status
  # line; its exit code is reported instead (e.g. 7 = connect failed,
  # 28 = timed out).
  local response curl_rc=0
  response=$(
    curl -sS \
      --max-time "$timeout_secs" \
      -H "Content-Type: application/json" \
      -X POST \
      -d "$payload" \
      -w '\n%{http_code}' \
      "$proxy_url" 2>/dev/null
  ) || curl_rc=$?

  local http_code="000" resp_body="$response"
  if [[ "$response" == *"$nl"* ]]; then
    http_code=${response##*"$nl"}
    resp_body=${response%"$nl"*}
  fi
  # A failed transfer is always reported as 000, whatever curl printed.
  if [ "$curl_rc" -ne 0 ]; then
    http_code="000"
  fi

  # Flatten the body excerpt to one line: a raw multi-line body would
  # break the machine-readable status line, and a body line starting
  # with `::` would be parsed as a workflow command of its own.
  local snippet=${resp_body:0:500}
  snippet=${snippet//$nl/ }
  snippet=${snippet//$cr/ }

  if [ "$http_code" != "200" ]; then
    # NOTE: the prefix `DEP-DOWN:staging-llm` is the SSOT that the
    # redgate-reporter parses for dedup. Do not edit without coordinating
    # with the redgate-reporter's parser in molecule-ci.
    echo "::error::DEP-DOWN:staging-llm preflight failed: proxy=$proxy_url http_code=$http_code curl_exit=$curl_rc body=$snippet"
    return 70
  fi

  # Even on 200, sanity-check the response shape — a proxy that returns
  # 200 with an empty/malformed body is itself a class of outage.
  if [[ "$resp_body" != *'"choices"'* ]]; then
    echo "::error::DEP-DOWN:staging-llm preflight failed: 200 with malformed body: $snippet"
    return 70
  fi

  return 0
}

# ---------------------------------------------------------------------------
# diff --git a/tests/e2e/test_llm_proxy_preflight_unit.sh
#            b/tests/e2e/test_llm_proxy_preflight_unit.sh
# (new file, mode 100755, index 000000000..46c06f12a)
# ---------------------------------------------------------------------------
#!/usr/bin/env bash
# Unit tests for tests/e2e/lib/llm_proxy_preflight.sh (core#2675).
#
# Verifies:
#   1. Config-missing path (exit 71) when E2E_LLM_PROXY_URL is unset AND
#      MOLECULE_CP_URL is unset.
#   2. DEP-DOWN path (exit 70) when the proxy URL is unreachable.
#   3. DEP-DOWN path (exit 70) when the proxy returns 200 with a
#      malformed body (the 2026-06-12 incident's "200 with empty body"
#      class of outage — see lib doc).
#   4. Happy path (exit 0) when the proxy returns 200 with a normal
#      completion body containing "choices".
#   5. The error message starts with the `DEP-DOWN:staging-llm` prefix
#      that the redgate-reporter parses for dedup.
#
# These tests use a small Python helper as a stand-in for the actual LLM
# proxy (avoids needing a real proxy in the test environment). The Python
# helper listens on a localhost port and serves a configurable response.

set -uo pipefail

# Find the lib under test. Allow override for CI flexibility.
LIB_PATH="${LIB_PATH:-$(cd "$(dirname "$0")" && pwd)/lib/llm_proxy_preflight.sh}"

# shellcheck source=lib/llm_proxy_preflight.sh
# shellcheck disable=SC1091
source "$LIB_PATH"

# Start a tiny Python HTTP server to stand in for the LLM proxy. We use
# Python's http.server because it ships in the base image and doesn't
# require extra dependencies. Each test picks a free port via Python's
# socket binding (avoids race conditions in test parallelism).
# State shared between start_test_server / stop_test_server and the tests.
PY_SERVER_PORT=""          # port of the most recently started stand-in server
PY_SERVER_LOG=$(mktemp)    # stdout/stderr of the stand-in server
PY_SERVER_PID=             # PID of the running stand-in server, empty if none

#######################################
# Start the stand-in LLM proxy on a free loopback port.
# Arguments:
#   $1 - mode: "ok" | "down" | "empty_200"
# Globals written: PY_SERVER_PID, PY_SERVER_PORT
#
# NOTE(review): the original server script body was not recoverable from
# the collapsed diff (only `cat > /tmp/..._server.py <` survived). The
# handler below is reconstructed from the three modes the tests use:
# "ok" -> 200 + a body containing "choices", "down" -> 503,
# "empty_200" -> 200 with an empty body. Confirm against the upstream file.
#######################################
start_test_server() {
  local mode="$1" # "ok" | "down" | "empty_200"
  # Pick a free port via socket binding; pass it explicitly to the server.
  local port
  port=$(python3 -c "
import socket
s = socket.socket()
s.bind(('127.0.0.1', 0))
print(s.getsockname()[1])
s.close()
")

  # The server source is fed on stdin (`python3 -`), so no script file
  # with a predictable name is written to /tmp.
  python3 - "$mode" "$port" >"$PY_SERVER_LOG" 2>&1 <<'PYEOF' &
import http.server
import sys

mode, port = sys.argv[1], int(sys.argv[2])


class Handler(http.server.BaseHTTPRequestHandler):
    def do_POST(self):
        # Drain the request body so the client sees a clean exchange.
        self.rfile.read(int(self.headers.get("Content-Length") or 0))
        if mode == "down":
            code, body = 503, b'{"error":"upstream unavailable"}'
        elif mode == "empty_200":
            code, body = 200, b""
        else:
            code = 200
            body = b'{"choices":[{"message":{"role":"assistant","content":"pong"}}]}'
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *args):
        pass


http.server.HTTPServer(("127.0.0.1", port), Handler).serve_forever()
PYEOF
  PY_SERVER_PID=$!

  # Wait until the port accepts connections (capped at ~2s) instead of a
  # fixed sleep, so a slow interpreter start does not flake the test.
  local attempt
  for attempt in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20; do
    if (exec 3<>"/dev/tcp/127.0.0.1/${port}") 2>/dev/null; then
      break
    fi
    sleep 0.1
  done
  PY_SERVER_PORT="$port"
}

#######################################
# Stop the stand-in server (if any) and remove its log file.
# Safe to call repeatedly: PY_SERVER_PID is cleared after the first stop.
#######################################
stop_test_server() {
  if [ -n "$PY_SERVER_PID" ]; then
    kill "$PY_SERVER_PID" 2>/dev/null || true
    wait "$PY_SERVER_PID" 2>/dev/null || true
    PY_SERVER_PID=
  fi
  rm -f -- "$PY_SERVER_LOG"
}
trap stop_test_server EXIT

# Test 1: config-missing path.
test_config_missing() {
  unset E2E_LLM_PROXY_URL
  unset MOLECULE_CP_URL
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  if [ "$rc" -ne 71 ]; then
    echo "FAIL: test_config_missing expected exit 71, got $rc"
    echo "  output: $out"
    return 1
  fi
  # Config-missing emits CONFIG-MISSING, NOT DEP-DOWN — the two are
  # distinct dedup buckets in the redgate-reporter.
  if ! grep -q "CONFIG-MISSING:staging-llm-proxy-url" <<<"$out"; then
    echo "FAIL: test_config_missing output missing CONFIG-MISSING:staging-llm-proxy-url prefix"
    echo "  output: $out"
    return 1
  fi
  if grep -q "DEP-DOWN:staging-llm" <<<"$out"; then
    echo "FAIL: test_config_missing output should NOT contain DEP-DOWN:staging-llm (config-missing is a separate dedup bucket)"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: test_config_missing"
  return 0
}

# Test 2: proxy unreachable (TCP connection refused) → exit 70.
#######################################
# Run the preflight against a freshly started stand-in server.
# Meant to be called inside $(...): prints the preflight's combined
# stdout+stderr and returns its exit status. The server is stopped before
# returning, so no assertion failure in the caller can leak it.
# Arguments:
#   $1 - server mode passed to start_test_server
#######################################
_preflight_against_server() {
  local mode="$1" rc
  start_test_server "$mode"
  # Exported so shellcheck does not flag it as unused (SC2034); the
  # sourced llm_proxy_preflight helper reads it via ${E2E_LLM_PROXY_URL:-}.
  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
  llm_proxy_preflight 2>&1
  rc=$?
  stop_test_server
  PY_SERVER_PID=
  return "$rc"
}

#######################################
# Shared DEP-DOWN assertion: exit status 70 and the SSOT prefix present.
# Arguments:
#   $1 - test name, $2 - observed exit status, $3 - captured output
#######################################
_expect_dep_down() {
  local name="$1" rc="$2" out="$3"
  if [ "$rc" -ne 70 ]; then
    echo "FAIL: $name expected exit 70, got $rc"
    echo "  output: $out"
    return 1
  fi
  if ! grep -q "DEP-DOWN:staging-llm" <<<"$out"; then
    echo "FAIL: $name output missing DEP-DOWN:staging-llm prefix"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: $name"
  return 0
}

# Test 2: proxy unreachable → exit 70. No stand-in server is started;
# the URL points at loopback port 1, which is expected to refuse.
test_proxy_unreachable() {
  export E2E_LLM_PROXY_URL="http://127.0.0.1:1/v1/chat/completions"
  local out rc
  out=$(llm_proxy_preflight 2>&1)
  rc=$?
  _expect_dep_down "test_proxy_unreachable" "$rc" "$out"
}

# Test 3: proxy returns 200 with malformed body → exit 70.
test_200_empty_body() {
  local out rc
  out=$(_preflight_against_server "empty_200")
  rc=$?
  _expect_dep_down "test_200_empty_body" "$rc" "$out"
}

# Test 4: happy path → exit 0.
test_ok() {
  local out rc
  out=$(_preflight_against_server "ok")
  rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "FAIL: test_ok expected exit 0, got $rc"
    echo "  output: $out"
    return 1
  fi
  echo "PASS: test_ok"
  return 0
}

# Test 5: proxy returns 503 (simulated outage) → exit 70, with the
# DEP-DOWN prefix like every other outage path.
test_503() {
  local out rc
  out=$(_preflight_against_server "down")
  rc=$?
  _expect_dep_down "test_503" "$rc" "$out"
}

# Run every test; count failures rather than stopping at the first one.
failed=0
for test_fn in \
  test_config_missing \
  test_proxy_unreachable \
  test_200_empty_body \
  test_ok \
  test_503; do
  "$test_fn" || failed=$((failed + 1))
done

if [ "$failed" -gt 0 ]; then
  echo "FAILED: $failed test(s)"
  exit 1
fi
echo "All llm_proxy_preflight unit tests passed"