From 8135ee4c3af9fe3aa8993b9720249d5d86c491c6 Mon Sep 17 00:00:00 2001
From: core-devops <core-devops@agents.moleculesai.app>
Date: Fri, 5 Jun 2026 11:54:50 -0700
Subject: [PATCH] fix(e2e): reconciler platform-path model + surface boot error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The e2e-staging-reconciler workflow set E2E_LLM_PATH=platform (sends
secrets={}, platform-managed billing) AND E2E_MODEL_SLUG=MiniMax-M2.
In pick_model_slug (tests/e2e/lib/model_slug.sh) E2E_MODEL_SLUG wins
over the E2E_LLM_PATH=platform branch, so the workspace was created
with the BARE id `MiniMax-M2` — a member of the providers.yaml
claude-code `minimax` BYOK arm (provider=minimax, requires
MINIMAX_API_KEY) — while NO key was injected. A keyless BYOK-minimax
model cannot resolve a serving path, so the workspace booted straight
to status=failed and never reached online ("never reached
status=online within 900s, last status=failed").

This is a test-config contradiction, not a workspace-server boot bug:
the log even prints the mismatch — "LLM path: PLATFORM-MANAGED ...
moonshot/kimi-k2.6" immediately followed by "MODEL_SLUG=MiniMax-M2"
then "→ failed" (run 223233, job 295646).

Fix (workflow-only): drop E2E_MODEL_SLUG and the misleading E2E_*_API_KEY
wiring so the platform path is coherent — pick_model_slug now returns the
platform default moonshot/kimi-k2.6 (a providers.yaml claude-code
`platform` arm member → provider=platform, CP-proxy billed, no tenant
key). Mirrors the e2e-staging-platform-boot job in e2e-staging-saas.yml,
which is the proven-clean keyless platform create combo.

Also (#2310-class): on the online-timeout, last_sample_error came back
EMPTY (the agent failed before its first heartbeat), so "err=" was
opaque. Add a diagnostic burst that logs the model, the llm_path,
whether secrets were sent (only "(none)" or "(set)", not their values),
every plausible error field, and the full /workspaces/<id> record — so
a future boot failure names its own cause without a re-run.

Test-only/workflow-only. bash -n + shellcheck clean; test_model_slug.sh
21/0; YAML valid.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitea/workflows/e2e-staging-reconciler.yml   | 27 +++++++++++++------
 ...st_reconciler_heals_terminated_instance.sh | 18 ++++++++++++-
 2 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/.gitea/workflows/e2e-staging-reconciler.yml b/.gitea/workflows/e2e-staging-reconciler.yml
index da3333223..13dc04847 100644
--- a/.gitea/workflows/e2e-staging-reconciler.yml
+++ b/.gitea/workflows/e2e-staging-reconciler.yml
@@ -101,18 +101,29 @@ jobs:
       # so teardown MUST positively confirm no slug-tagged box survives.
       E2E_AWS_LEAK_CHECK: required
       E2E_AWS_TERMINATE_LEAKS: '1'
-      # claude-code + MiniMax is the cheapest boot-to-online path (same as the
-      # saas job). The reconciler test never makes a completion, but the key is
-      # wired so the first boot reaches online on the same path the saas
-      # harness uses. First non-empty wins in the script's priority chain.
-      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
-      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
-      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
       E2E_RUNTIME: claude-code
       # Platform-managed create path (moonshot/kimi-k2.6, no tenant key) — the
       # combo proven to create cleanly; this test only needs the ws online.
+      #
+      # DELIBERATELY no E2E_MODEL_SLUG and no E2E_*_API_KEY here — mirror the
+      # e2e-staging-platform-boot job in e2e-staging-saas.yml. On
+      # E2E_LLM_PATH=platform the harness sends EMPTY secrets and lets
+      # pick_model_slug return the platform default moonshot/kimi-k2.6 (a member
+      # of the providers.yaml claude-code `platform` arm → provider=platform,
+      # billed by the CP LLM proxy, NO tenant key required).
+      #
+      # The previous wiring set E2E_MODEL_SLUG: MiniMax-M2 (a BARE id in the
+      # providers.yaml `minimax` BYOK arm → provider=minimax, requires
+      # MINIMAX_API_KEY) while sending secrets={} on the platform path. Because
+      # E2E_MODEL_SLUG wins over the E2E_LLM_PATH=platform branch in
+      # pick_model_slug, the workspace got a keyless BYOK-minimax model, could
+      # not resolve a serving path, and booted to status=failed — never online
+      # (run 223233: "MODEL_SLUG=MiniMax-M2" then "→ failed", "never reached
+      # status=online within 900s"). The BYOK key wiring was equally misleading:
+      # the harness ignores E2E_*_API_KEY on E2E_LLM_PATH=platform, so the keys
+      # only made the contradiction harder to spot. Platform-only is correct
+      # here — this test exercises instance-state, never an LLM completion.
       E2E_LLM_PATH: platform
-      E2E_MODEL_SLUG: MiniMax-M2
       E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
       E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
 
diff --git a/tests/e2e/test_reconciler_heals_terminated_instance.sh b/tests/e2e/test_reconciler_heals_terminated_instance.sh
index b1c791299..b8a21d7b8 100755
--- a/tests/e2e/test_reconciler_heals_terminated_instance.sh
+++ b/tests/e2e/test_reconciler_heals_terminated_instance.sh
@@ -389,8 +389,24 @@ INSTANCE_ID_GRACE_SECS="${E2E_INSTANCE_ID_GRACE_SECS:-45}"
 WS_LAST_STATUS=""
 while true; do
   if [ "$(date +%s)" -gt "$ONLINE_DEADLINE" ]; then
+    # Boot-failure diagnostic burst (#2310-class): last_sample_error is often
+    # EMPTY for a config-resolution failure (the agent never sampled — it
+    # failed before its first heartbeat), so a bare "err=" tells us nothing
+    # (run 223233). Surface the FULL workspace record + every plausible error
+    # field so the actual reason (e.g. unservable provider, missing key, wrong
+    # model arm) is visible without re-running.
     WS_LAST_ERR=$(ws_field "$WS_ID" "last_sample_error")
-    fail "Workspace $WS_ID never reached status=online within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR)"
+    log "── DIAGNOSTIC BURST (step 4 — workspace never reached online) ──"
+    log "    model=$MODEL_SLUG  llm_path=${E2E_LLM_PATH:-platform}  secrets=$([ "$SECRETS_JSON" = '{}' ] && echo '(none)' || echo '(set)')"
+    for f in status last_sample_error last_error error provisioning_error instance_id instance_status; do
+      log "    ${f}=$(ws_field "$WS_ID" "$f")"
+    done
+    log "    full record:"
+    tenant_call GET "/workspaces/$WS_ID" 2>/dev/null \
+      | python3 -m json.tool 2>/dev/null | sed 's/^/      /' \
+      || log "      (could not fetch /workspaces/$WS_ID)"
+    log "── END DIAGNOSTIC ──"
+    fail "Workspace $WS_ID never reached status=online within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR; see diagnostic burst above)"
   fi
   WS_STATUS=$(ws_field "$WS_ID" "status")
   if [ "$WS_STATUS" != "$WS_LAST_STATUS" ]; then
-- 
2.52.0