From 6447edd2fd0354f54097808c5bf9d95808f05d39 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 4 Jun 2026 19:21:52 -0700 Subject: [PATCH 1/2] =?UTF-8?q?fix(e2e):=20reconciler=20e2e=20=E2=80=94=20?= =?UTF-8?q?use=20platform-managed=20create=20path=20+=20capture=20400=20bo?= =?UTF-8?q?dy=20(core#2261)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes so the live reconciler e2e can actually reach its assertion: 1. The create 400'd because the script used the BYOK path (MiniMax-M2 + MINIMAX_API_KEY secret) — a combo that fails workspace-create. Add the E2E_LLM_PATH=platform branch (DEFAULT) mirroring test_staging_full_saas.sh: moonshot/kimi-k2.6, no tenant key — the create combo proven to succeed. This test only needs the workspace status=online (then it kills the EC2), so it doesn't need a real LLM completion. 2. set -e + curl --fail-with-body aborted the create command-substitution before the fail line could echo $WS_RESP, hiding the real HTTP-400 reason. Capture the body via `|| { fail "...$WS_RESP" }` so any future create failure is diagnosable. core#2261 Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/e2e-staging-reconciler.yml | 3 ++ ...st_reconciler_heals_terminated_instance.sh | 28 +++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/.gitea/workflows/e2e-staging-reconciler.yml b/.gitea/workflows/e2e-staging-reconciler.yml index 341fecf40..da3333223 100644 --- a/.gitea/workflows/e2e-staging-reconciler.yml +++ b/.gitea/workflows/e2e-staging-reconciler.yml @@ -109,6 +109,9 @@ jobs: E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }} E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }} E2E_RUNTIME: claude-code + # Platform-managed create path (moonshot/kimi-k2.6, no tenant key) — the + # combo proven to create cleanly; this test only needs the ws online. 
+ E2E_LLM_PATH: platform E2E_MODEL_SLUG: MiniMax-M2 E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }} diff --git a/tests/e2e/test_reconciler_heals_terminated_instance.sh b/tests/e2e/test_reconciler_heals_terminated_instance.sh index 8869fd638..dddbeea90 100755 --- a/tests/e2e/test_reconciler_heals_terminated_instance.sh +++ b/tests/e2e/test_reconciler_heals_terminated_instance.sh @@ -325,7 +325,18 @@ ws_field() { # tolerable — but wiring the same keys keeps boot behaviour identical to the # sibling and avoids a config path that only this test would exercise. SECRETS_JSON='{}' -if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then +# Platform-managed path (E2E_LLM_PATH=platform, the DEFAULT for this test): +# the workspace boots on the CP LLM proxy with NO tenant key, model +# moonshot/kimi-k2.6 — the exact create combo test_staging_full_saas.sh uses +# successfully. This test only needs the workspace to reach status=online so +# it can kill the EC2 and assert the reconciler heals it; it does NOT exercise +# a real LLM completion, so the platform path is both sufficient and the one +# proven to create cleanly. (The BYOK key paths below 400'd at create — see +# the create-failure capture added below — which is why platform is default.) 
+if [ "${E2E_LLM_PATH:-platform}" = "platform" ]; then + log " LLM path: PLATFORM-MANAGED (no tenant key; moonshot/kimi-k2.6 via proxy)" + SECRETS_JSON='{}' +elif [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))") elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'ANTHROPIC_API_KEY': os.environ['E2E_ANTHROPIC_API_KEY']}))") @@ -345,15 +356,22 @@ print(json.dumps({ ") fi -MODEL_SLUG=$(pick_model_slug "$RUNTIME") +E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" MODEL_SLUG=$(E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" pick_model_slug "$RUNTIME") log " MODEL_SLUG=$MODEL_SLUG" log "4/6 Provisioning workspace (runtime=$RUNTIME)..." +# --fail-with-body makes curl exit non-zero on a 4xx/5xx but STILL writes the +# response body to stdout; the `|| { ... }` catches that so the body is printed +# instead of `set -e` aborting the command-substitution silently (the old bug +# that hid the real HTTP-400 reason). $WS_RESP holds the body either way. WS_RESP=$(tenant_call POST /workspaces \ -H "Content-Type: application/json" \ -d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}") -WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['id'])") -[ -z "$WS_ID" ] && fail "Workspace create response missing 'id': $WS_RESP" + -d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}") || { + rc=$? + fail "Workspace create failed (curl rc=$rc, model=$MODEL_SLUG). 
Response body: $WS_RESP" +} +WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null) || WS_ID="" # unparsable body must reach the fail below, not a silent set -e exit +[ -z "$WS_ID" ] && fail "Workspace create response missing 'id' (model=$MODEL_SLUG): $WS_RESP" log " WS_ID=$WS_ID" # Wait for the workspace to reach status=online and capture its instance_id. -- 2.52.0 From d8ff0b2503a8e7a79e1d3f477c60a4148efbc677 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 4 Jun 2026 19:51:20 -0700 Subject: [PATCH 2/2] =?UTF-8?q?fix(e2e):=20reconciler=20e2e=20=E2=80=94=20?= =?UTF-8?q?fail=20fast=20on=20online-wait=20(900s)=20to=20avoid=20EC2=20le?= =?UTF-8?q?ak=20(core#2261)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 216031 hung ~32min in the boot-to-online poll (3600s default) and leaked a running staging e2e-rec EC2 — the workspace never reached online (a staging boot/serving issue, same root as the full-saas A2A failures, upstream of the reconciler this test exercises). Reduce the online timeout default to 900s so a non-booting workspace fails fast and the teardown trap terminates the EC2 instead of hanging ~1h. Does not change what the test proves once staging can boot a workspace online. core#2261 Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/e2e/test_reconciler_heals_terminated_instance.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/e2e/test_reconciler_heals_terminated_instance.sh b/tests/e2e/test_reconciler_heals_terminated_instance.sh index dddbeea90..7b6850ac7 100755 --- a/tests/e2e/test_reconciler_heals_terminated_instance.sh +++ b/tests/e2e/test_reconciler_heals_terminated_instance.sh @@ -50,7 +50,11 @@ # Optional env (mirrors the full-saas harness where they overlap): # E2E_RUNTIME claude-code (default) # E2E_PROVISION_TIMEOUT_SECS default 900 (cold EC2 budget) -# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 3600 (cold-boot worst-case) +# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 900 (15min). 
A workspace that +# cannot reach online in 15min is a staging/boot problem, +# not slow cold-boot — fail fast so the trap tears down the +# EC2 instead of hanging ~1h and leaking a running instance +# (observed: run 216031 hung 32min with a live e2e-rec EC2). # E2E_RECONCILE_OFFLINE_TIMEOUT_SECS default 180 (PRIMARY: leave 'online'. # Reconciler cadence is 60s — 3 cycles + # AWS terminate-visibility slack.) @@ -82,7 +86,7 @@ CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}" ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}" RUNTIME="${E2E_RUNTIME:-claude-code}" PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}" -WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}" +WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-900}" # PRIMARY bound: the reconciler ticks every 60s; it needs one cycle to see # the dead instance after AWS makes the terminate visible to DescribeInstances # (typically seconds, but can lag). 180s = ~3 cycles + slack. -- 2.52.0