From aa2ae25ac85383dd67b801637e2dce75acb0a581 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
 <dev-engineer-a-kimi@agents.moleculesai.app>
Date: Sun, 14 Jun 2026 16:41:34 +0000
Subject: [PATCH 1/2] fix(e2e): #76 staging LLM preflight uses correct model
 slug + optional auth

The preflight hard-coded the namespaced model slug
`minimax/MiniMax-M2.7`, which the staging LLM proxy rejects, causing a
false DEP-DOWN while the real E2E (bare slug) would succeed. It also
sent no Authorization header, so proxies that require auth were
mis-classified as down.

Changes:
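- Default preflight model to the bare slug `MiniMax-M2.7`.
- Add `E2E_LLM_PREFLIGHT_MODEL` override for lanes that need a different
  provider/model slug.
- Add `E2E_LLM_PREFLIGHT_API_KEY` override; when set, sent as
  `Authorization: Bearer <key>`.
- Add `-L` to curl so redirects from the proxy are followed.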
- Update unit tests to cover custom model and auth header.

Fixes #76
---
 tests/e2e/lib/llm_proxy_preflight.sh       | 30 +++++++--
 tests/e2e/test_llm_proxy_preflight_unit.sh | 78 +++++++++++++++++++++-
 2 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/tests/e2e/lib/llm_proxy_preflight.sh b/tests/e2e/lib/llm_proxy_preflight.sh
index 144709f5..9f493ea2 100755
--- a/tests/e2e/lib/llm_proxy_preflight.sh
+++ b/tests/e2e/lib/llm_proxy_preflight.sh
@@ -78,20 +78,38 @@ llm_proxy_preflight() {
   # name is a no-op for the liveness check (any model id that the proxy
   # will accept is fine; the proxy returns 200 + completion for healthy
   # provider keys and 5xx/timeout for outage conditions).
+  #
+  # #76 root cause: the hard-coded namespaced slug `minimax/MiniMax-M2.7`
+  # is rejected by the staging LLM proxy's model validation, so the
+  # preflight false-reds while the real E2E (which uses the bare slug
+  # `MiniMax-M2.7`) would succeed. Default to the bare slug and allow
+  # per-lane override via E2E_LLM_PREFLIGHT_MODEL.
+  local model="${E2E_LLM_PREFLIGHT_MODEL:-MiniMax-M2.7}"
+  local api_key="${E2E_LLM_PREFLIGHT_API_KEY:-}"
   local body
-  body=$(cat <<'JSON'
-{"model":"minimax/MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
+  body=$(cat <<JSON
+{"model":"$model","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
 JSON
 )
 
-  local tmpfile http_code
+  local tmpfile http_code curl_opts
   tmpfile=$(mktemp)
   # shellcheck disable=SC2064
   trap "rm -f '$tmpfile'" RETURN
 
-  http_code=$(curl -sS -o "$tmpfile" -w "%{http_code}" \
-    --max-time "$timeout_secs" \
-    -H "Content-Type: application/json" \
+  curl_opts=(
+    -sS
+    -L
+    -o "$tmpfile"
+    -w "%{http_code}"
+    --max-time "$timeout_secs"
+    -H "Content-Type: application/json"
+  )
+  if [ -n "$api_key" ]; then
+    curl_opts+=(-H "Authorization: Bearer $api_key")
+  fi
+
+  http_code=$(curl "${curl_opts[@]}" \
     -X POST \
     -d "$body" \
     "$proxy_url" 2>/dev/null) || http_code="000"
diff --git a/tests/e2e/test_llm_proxy_preflight_unit.sh b/tests/e2e/test_llm_proxy_preflight_unit.sh
index 46c06f12..4bc83f5b 100755
--- a/tests/e2e/test_llm_proxy_preflight_unit.sh
+++ b/tests/e2e/test_llm_proxy_preflight_unit.sh
@@ -35,7 +35,7 @@ PY_SERVER_LOG=$(mktemp)
 PY_SERVER_PID=
 
 start_test_server() {
-  local mode="$1"  # "ok" | "down" | "empty_200"
+  local mode="$1"  # "ok" | "down" | "empty_200" | "echo" | "auth"
   # Pick a free port via socket binding; pass it explicitly to the server.
   local port
   port=$(python3 -c "
@@ -51,6 +51,12 @@ mode = "$mode"
 port = $port
 class H(http.server.BaseHTTPRequestHandler):
     def do_POST(self):
+        length = int(self.headers.get('Content-Length', 0))
+        raw = self.rfile.read(length).decode('utf-8', errors='replace')
+        try:
+            req = json.loads(raw) if raw else {}
+        except json.JSONDecodeError:
+            req = {}
         if mode == "down":
             self.send_error(503, "simulated outage")
             return
@@ -60,8 +66,24 @@ class H(http.server.BaseHTTPRequestHandler):
             self.end_headers()
             self.wfile.write(b'{"error":"upstream silent"}')
             return
-        # ok
-        body = {"choices":[{"message":{"role":"assistant","content":"pong"}}]}
+        if mode == "auth":
+            auth = self.headers.get('Authorization', '')
+            if not auth.startswith('Bearer '):
+                self.send_response(401)
+                self.end_headers()
+                self.wfile.write(b'{"error":"missing auth"}')
+                return
+            # fall through to ok response
+        # ok / echo / auth-success: echo model back so tests can verify
+        # the request body was sent correctly. Also persist the full request
+        # to a well-known file for tests that need to inspect it.
+        req_path = "/tmp/_llm_preflight_last_request.json"
+        try:
+            with open(req_path, "w") as fh:
+                json.dump(req, fh)
+        except Exception:
+            pass
+        body = {"choices":[{"message":{"role":"assistant","content":req.get("model","pong")}}]}
         payload = json.dumps(body).encode()
         self.send_response(200)
         self.send_header("Content-Type", "application/json")
@@ -214,12 +236,62 @@ test_503() {
   return 0
 }
 
+# Test 6: custom model slug via E2E_LLM_PREFLIGHT_MODEL is sent in the request body.
+test_model_override() {
+  PY_SERVER_PORT=0
+  start_test_server "echo"
+  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
+  export E2E_LLM_PREFLIGHT_MODEL="custom-model-42"
+  rm -f /tmp/_llm_preflight_last_request.json
+  local out rc
+  out=$(llm_proxy_preflight 2>&1)
+  rc=$?
+  unset E2E_LLM_PREFLIGHT_MODEL
+  stop_test_server
+  PY_SERVER_PID=
+  if [ "$rc" -ne 0 ]; then
+    echo "FAIL: test_model_override expected exit 0, got $rc"
+    echo "  output: $out"
+    return 1
+  fi
+  if ! python3 -c "import json; d=json.load(open('/tmp/_llm_preflight_last_request.json')); assert d.get('model')=='custom-model-42'; print('model ok')" 2>&1; then
+    echo "FAIL: test_model_override did not send the custom model in the request body"
+    echo "  request file: $(cat /tmp/_llm_preflight_last_request.json 2>/dev/null || echo '<missing>')"
+    return 1
+  fi
+  echo "PASS: test_model_override"
+  return 0
+}
+
+# Test 7: optional Authorization header is sent when E2E_LLM_PREFLIGHT_API_KEY is set.
+test_auth_header() {
+  PY_SERVER_PORT=0
+  start_test_server "auth"
+  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
+  export E2E_LLM_PREFLIGHT_API_KEY="test-token-123"
+  local out rc
+  out=$(llm_proxy_preflight 2>&1)
+  rc=$?
+  unset E2E_LLM_PREFLIGHT_API_KEY
+  stop_test_server
+  PY_SERVER_PID=
+  if [ "$rc" -ne 0 ]; then
+    echo "FAIL: test_auth_header expected exit 0, got $rc"
+    echo "  output: $out"
+    return 1
+  fi
+  echo "PASS: test_auth_header"
+  return 0
+}
+
 failed=0
 test_config_missing || failed=$((failed+1))
 test_proxy_unreachable || failed=$((failed+1))
 test_200_empty_body || failed=$((failed+1))
 test_ok || failed=$((failed+1))
 test_503 || failed=$((failed+1))
+test_model_override || failed=$((failed+1))
+test_auth_header || failed=$((failed+1))
 
 if [ "$failed" -gt 0 ]; then
   echo "FAILED: $failed test(s)"
-- 
2.52.0


From 2234b4ace188c2b949d94a7e7f923138260d31e3 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
 <dev-engineer-a-kimi@agents.moleculesai.app>
Date: Sun, 14 Jun 2026 17:17:09 +0000
Subject: [PATCH 2/2] fix(e2e): #76 staging LLM preflight treats any HTTP
 response as UP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The preflight was classifying the staging LLM proxy's 401 response to
an unauthenticated probe as DEP-DOWN, causing fleet-wide false
staging-down incidents since 2026-06-13.

Adopt Option 1 from the driver brief: the preflight only needs to prove
REACHABILITY. Any HTTP response (including 401/403/404) means the proxy
is up; only transport failures (connection refused, timeout) or 5xx
server errors classify as DEP-DOWN.

Changes:
- Reclassify non-5xx HTTP responses as preflight OK.
- Remove the optional Authorization header path (no credential needed).
- Update unit tests: 401 now passes, 5xx/unreachable still fail.

Fixes #76

🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
 tests/e2e/lib/llm_proxy_preflight.sh       |  65 +++++-------
 tests/e2e/test_llm_proxy_preflight_unit.sh | 113 ++++-----------------
 2 files changed, 43 insertions(+), 135 deletions(-)

diff --git a/tests/e2e/lib/llm_proxy_preflight.sh b/tests/e2e/lib/llm_proxy_preflight.sh
index 9f493ea2..e767d3c5 100755
--- a/tests/e2e/lib/llm_proxy_preflight.sh
+++ b/tests/e2e/lib/llm_proxy_preflight.sh
@@ -35,14 +35,23 @@
 #
 # STATUS CODES
 # ============
-#   0   preflight OK (the proxy answered a cheap completion cleanly)
-#   70  DEP-DOWN:staging-llm (proxy unreachable, slow, or auth-failed)
+#   0   preflight OK (the proxy is reachable and returned an HTTP response)
+#   70  DEP-DOWN:staging-llm (proxy unreachable, slow, or returned a 5xx)
 #   71  E2E_LLM_PROXY_URL not set and the URL could not be derived
 #
 # Why distinct exit codes: the redgate-reporter and the workflow's notify
 # step can use them to differentiate "infrastructure down" from "config
 # missing" (the latter is operator error and should not dedup against
 # live dependency outages).
+#
+# SEMANTICS NOTE (#76 root cause, 2026-06-13):
+#   The preflight sends an UNauthenticated probe. A healthy staging LLM proxy
+#   that requires auth correctly returns 401. Previously any non-200 status
+#   (including 401) was classified as DEP-DOWN, causing fleet-wide false
+#   staging-down incidents. The preflight only needs to prove REACHABILITY:
+#   any HTTP response (including 401/403/404) means the proxy is up. Only
+#   transport failures (connection refused, timeout) or 5xx server errors
+#   classify as DEP-DOWN.
 
 # e2e_llm_proxy_preflight
 #   Source the lib's `llm_proxy_preflight` function. Returns 0 on success,
@@ -74,47 +83,32 @@ llm_proxy_preflight() {
     return 71
   fi
 
-  # Cheap completion: minimal token count, no streaming. The exact model
-  # name is a no-op for the liveness check (any model id that the proxy
-  # will accept is fine; the proxy returns 200 + completion for healthy
-  # provider keys and 5xx/timeout for outage conditions).
-  #
-  # #76 root cause: the hard-coded namespaced slug `minimax/MiniMax-M2.7`
-  # is rejected by the staging LLM proxy's model validation, so the
-  # preflight false-reds while the real E2E (which uses the bare slug
-  # `MiniMax-M2.7`) would succeed. Default to the bare slug and allow
-  # per-lane override via E2E_LLM_PREFLIGHT_MODEL.
-  local model="${E2E_LLM_PREFLIGHT_MODEL:-MiniMax-M2.7}"
-  local api_key="${E2E_LLM_PREFLIGHT_API_KEY:-}"
+  # Cheap, auth-less reachability probe: minimal token count, no streaming.
+  # The model name is a no-op for reachability; the bare slug avoids a
+  # provider-specific 400 on proxies that validate model IDs.
   local body
-  body=$(cat <<JSON
-{"model":"$model","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
+  body=$(cat <<'JSON'
+{"model":"MiniMax-M2.7","max_tokens":1,"messages":[{"role":"user","content":"pong"}]}
 JSON
 )
 
-  local tmpfile http_code curl_opts
+  local tmpfile http_code
   tmpfile=$(mktemp)
   # shellcheck disable=SC2064
   trap "rm -f '$tmpfile'" RETURN
 
-  curl_opts=(
-    -sS
-    -L
-    -o "$tmpfile"
-    -w "%{http_code}"
-    --max-time "$timeout_secs"
-    -H "Content-Type: application/json"
-  )
-  if [ -n "$api_key" ]; then
-    curl_opts+=(-H "Authorization: Bearer $api_key")
-  fi
-
-  http_code=$(curl "${curl_opts[@]}" \
+  http_code=$(curl -sS -o "$tmpfile" -w "%{http_code}" \
+    --max-time "$timeout_secs" \
+    -H "Content-Type: application/json" \
     -X POST \
     -d "$body" \
     "$proxy_url" 2>/dev/null) || http_code="000"
 
-  if [ "$http_code" != "200" ]; then
+  # #76 semantics fix: the preflight only needs to prove the proxy is
+  # reachable and speaking HTTP. An auth-required proxy returns 401; a
+  # mis-routed probe returns 404 — both mean the proxy is UP. Only
+  # transport failures (http_code=000) or 5xx server errors mean DOWN.
+  if [ "$http_code" = "000" ] || [[ "$http_code" == 5* ]]; then
     # NOTE: the prefix `DEP-DOWN:staging-llm` is the SSOT that the
     # redgate-reporter parses for dedup. Do not edit without coordinating
     # with the redgate-reporter's parser in molecule-ci.
@@ -122,14 +116,5 @@ JSON
     return 70
   fi
 
-  # Even on 200, sanity-check the response shape — an LLM proxy that
-  # returns 200 with an empty/malformed body is itself a class of outage
-  # (the 2026-06-12 incident had a few minutes of 200 + empty body for
-  # one of the affected providers).
-  if ! grep -q '"choices"' "$tmpfile" 2>/dev/null; then
-    echo "::error::DEP-DOWN:staging-llm preflight failed: 200 with malformed body: $(head -c 500 "$tmpfile" 2>/dev/null)"
-    return 70
-  fi
-
   return 0
 }
diff --git a/tests/e2e/test_llm_proxy_preflight_unit.sh b/tests/e2e/test_llm_proxy_preflight_unit.sh
index 4bc83f5b..7aa90ce4 100755
--- a/tests/e2e/test_llm_proxy_preflight_unit.sh
+++ b/tests/e2e/test_llm_proxy_preflight_unit.sh
@@ -5,11 +5,11 @@
 #   1. Config-missing path (exit 71) when E2E_LLM_PROXY_URL is unset AND
 #      MOLECULE_CP_URL is unset.
 #   2. DEP-DOWN path (exit 70) when the proxy URL is unreachable.
-#   3. DEP-DOWN path (exit 70) when the proxy returns 200 with a
-#      malformed body (the 2026-06-12 incident's "200 with empty body"
-#      class of outage — see lib doc).
-#   4. Happy path (exit 0) when the proxy returns 200 with a normal
-#      completion body containing "choices".
+#   3. DEP-DOWN path (exit 70) when the proxy returns 5xx.
+#   4. Happy path (exit 0) when the proxy returns any HTTP response,
+#      including 401 (the #76 semantics fix: an unauthenticated probe
+#      against an auth-required proxy must NOT be classified as
+#      dependency-down).
 #   5. The error message starts with the `DEP-DOWN:staging-llm` prefix
 #      that the redgate-reporter parses for dedup.
 #
@@ -35,7 +35,7 @@ PY_SERVER_LOG=$(mktemp)
 PY_SERVER_PID=
 
 start_test_server() {
-  local mode="$1"  # "ok" | "down" | "empty_200" | "echo" | "auth"
+  local mode="$1"  # "ok" | "down" | "unauth"
   # Pick a free port via socket binding; pass it explicitly to the server.
   local port
   port=$(python3 -c "
@@ -51,39 +51,17 @@ mode = "$mode"
 port = $port
 class H(http.server.BaseHTTPRequestHandler):
     def do_POST(self):
-        length = int(self.headers.get('Content-Length', 0))
-        raw = self.rfile.read(length).decode('utf-8', errors='replace')
-        try:
-            req = json.loads(raw) if raw else {}
-        except json.JSONDecodeError:
-            req = {}
         if mode == "down":
             self.send_error(503, "simulated outage")
             return
-        if mode == "empty_200":
-            self.send_response(200)
+        if mode == "unauth":
+            self.send_response(401)
             self.send_header("Content-Type", "application/json")
             self.end_headers()
-            self.wfile.write(b'{"error":"upstream silent"}')
+            self.wfile.write(b'{"error":"unauthorized"}')
             return
-        if mode == "auth":
-            auth = self.headers.get('Authorization', '')
-            if not auth.startswith('Bearer '):
-                self.send_response(401)
-                self.end_headers()
-                self.wfile.write(b'{"error":"missing auth"}')
-                return
-            # fall through to ok response
-        # ok / echo / auth-success: echo model back so tests can verify
-        # the request body was sent correctly. Also persist the full request
-        # to a well-known file for tests that need to inspect it.
-        req_path = "/tmp/_llm_preflight_last_request.json"
-        try:
-            with open(req_path, "w") as fh:
-                json.dump(req, fh)
-        except Exception:
-            pass
-        body = {"choices":[{"message":{"role":"assistant","content":req.get("model","pong")}}]}
+        # ok
+        body = {"choices":[{"message":{"role":"assistant","content":"pong"}}]}
         payload = json.dumps(body).encode()
         self.send_response(200)
         self.send_header("Content-Type", "application/json")
@@ -162,10 +140,10 @@ test_proxy_unreachable() {
   return 0
 }
 
-# Test 3: proxy returns 200 with malformed body → exit 70.
-test_200_empty_body() {
+# Test 3: proxy returns 401 (auth required) → exit 0 (#76 semantics fix).
+test_401_reachable() {
   PY_SERVER_PORT=0
-  start_test_server "empty_200"
+  start_test_server "unauth"
   # E2E_LLM_PROXY_URL is read by the sourced llm_proxy_preflight helper
   # (lib/llm_proxy_preflight.sh) via ${E2E_LLM_PROXY_URL:-}. Export it
   # here so shellcheck doesn't false-positive SC2034 (appears unused) when
@@ -174,19 +152,14 @@ test_200_empty_body() {
   local out rc
   out=$(llm_proxy_preflight 2>&1)
   rc=$?
-  if [ "$rc" -ne 70 ]; then
-    echo "FAIL: test_200_empty_body expected exit 70, got $rc"
-    echo "  output: $out"
-    return 1
-  fi
-  if ! echo "$out" | grep -q "DEP-DOWN:staging-llm"; then
-    echo "FAIL: test_200_empty_body output missing DEP-DOWN:staging-llm prefix"
+  if [ "$rc" -ne 0 ]; then
+    echo "FAIL: test_401_reachable expected exit 0, got $rc"
     echo "  output: $out"
     return 1
   fi
   stop_test_server
   PY_SERVER_PID=
-  echo "PASS: test_200_empty_body"
+  echo "PASS: test_401_reachable"
   return 0
 }
 
@@ -236,62 +209,12 @@ test_503() {
   return 0
 }
 
-# Test 6: custom model slug via E2E_LLM_PREFLIGHT_MODEL is sent in the request body.
-test_model_override() {
-  PY_SERVER_PORT=0
-  start_test_server "echo"
-  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
-  export E2E_LLM_PREFLIGHT_MODEL="custom-model-42"
-  rm -f /tmp/_llm_preflight_last_request.json
-  local out rc
-  out=$(llm_proxy_preflight 2>&1)
-  rc=$?
-  unset E2E_LLM_PREFLIGHT_MODEL
-  stop_test_server
-  PY_SERVER_PID=
-  if [ "$rc" -ne 0 ]; then
-    echo "FAIL: test_model_override expected exit 0, got $rc"
-    echo "  output: $out"
-    return 1
-  fi
-  if ! python3 -c "import json; d=json.load(open('/tmp/_llm_preflight_last_request.json')); assert d.get('model')=='custom-model-42'; print('model ok')" 2>&1; then
-    echo "FAIL: test_model_override did not send the custom model in the request body"
-    echo "  request file: $(cat /tmp/_llm_preflight_last_request.json 2>/dev/null || echo '<missing>')"
-    return 1
-  fi
-  echo "PASS: test_model_override"
-  return 0
-}
-
-# Test 7: optional Authorization header is sent when E2E_LLM_PREFLIGHT_API_KEY is set.
-test_auth_header() {
-  PY_SERVER_PORT=0
-  start_test_server "auth"
-  export E2E_LLM_PROXY_URL="http://127.0.0.1:${PY_SERVER_PORT}/v1/chat/completions"
-  export E2E_LLM_PREFLIGHT_API_KEY="test-token-123"
-  local out rc
-  out=$(llm_proxy_preflight 2>&1)
-  rc=$?
-  unset E2E_LLM_PREFLIGHT_API_KEY
-  stop_test_server
-  PY_SERVER_PID=
-  if [ "$rc" -ne 0 ]; then
-    echo "FAIL: test_auth_header expected exit 0, got $rc"
-    echo "  output: $out"
-    return 1
-  fi
-  echo "PASS: test_auth_header"
-  return 0
-}
-
 failed=0
 test_config_missing || failed=$((failed+1))
 test_proxy_unreachable || failed=$((failed+1))
-test_200_empty_body || failed=$((failed+1))
+test_401_reachable || failed=$((failed+1))
 test_ok || failed=$((failed+1))
 test_503 || failed=$((failed+1))
-test_model_override || failed=$((failed+1))
-test_auth_header || failed=$((failed+1))
 
 if [ "$failed" -gt 0 ]; then
   echo "FAILED: $failed test(s)"
-- 
2.52.0