From d0ab3d7c4b88b1499a656341cf74a59b26c76aa5 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 4 Jun 2026 18:49:56 -0700 Subject: [PATCH] fix(e2e): staging SaaS canary uses namespaced minimax:MiniMax-M2.7 (#2263) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The staging SaaS E2E provisioned its claude-code canary with the BARE id `MiniMax-M2`. The deployed staging tenant ws-server's compiled model registry lags source, so validateRegisteredModelForRuntime returns HTTP 400 on the bare id at workspace-create. The sibling Platform Boot job, on the SAME image, succeeds with the NAMESPACED `moonshot/kimi-k2.6` — only the id form differs (deploy-skew, internal#718; NOT flaky). Harness-side fix: switch the claude-code MiniMax default from bare `MiniMax-M2` to the COLON-namespaced `minimax:MiniMax-M2.7`. Crucially this is the colon (BYOK) form, NOT the slash/platform form `minimax/MiniMax-M2.7` the issue floated: the canary injects E2E_MINIMAX_API_KEY (BYOK), so the #1994 byok-not-platform guard asserts provider_selection=minimax. The colon form stays in the BYOK `minimax` arm (providers.yaml:851 → provider=minimax, passes the guard); the slash form resolves to provider=platform and would trip it. Mirrors how the proven-working kimi BYOK colon-form is registered. Changed both the operator-override default in e2e-staging-saas.yml (which sets E2E_MODEL_SLUG and wins over pick_model_slug) and the pick_model_slug fallback in lib/model_slug.sh, plus the pinned unit-test expectations. Also: widen the known-answer A2A POST retry grep to include the Cloudflare-shaped literal `error code: 502/504` token, matching the cold-start PONG probe and delegation loops. A single un-retried edge 502 right after a healthy round-trip (Platform Boot, task 268859) fell through to break and failed the gate on the first attempt. Bounded by the existing 6-attempt/sleep-10 loop — no new sleep-as-fix. NOTE: harness-side only. The durable fix is promoting the staging tenant ws-server runtime image to a build whose compiled registry includes the bare id. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/e2e-staging-saas.yml | 11 ++++++++++- tests/e2e/lib/model_slug.sh | 17 +++++++++++++++-- tests/e2e/test_model_slug.sh | 4 ++-- tests/e2e/test_staging_full_saas.sh | 11 +++++++++-- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index 82b3c46d8..584349142 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -172,7 +172,16 @@ jobs: # and defeats the cost saving. Operators can override via the # workflow_dispatch flow (no input wired here yet — runtime # override is enough for ad-hoc). - E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2' }} + # + # #2263 deploy-skew: the claude-code default is the COLON-namespaced BYOK + # id `minimax:MiniMax-M2.7`, NOT bare `MiniMax-M2`. The deployed staging + # ws-server's compiled registry can lag source; validateRegisteredModelForRuntime + # 400s the bare form on an older image (the sibling Platform Boot job, on + # the SAME image, succeeds with namespaced `moonshot/kimi-k2.6`). The colon + # form stays in the BYOK `minimax` arm (providers.yaml:851) so it resolves + # provider=minimax (BYOK) and the #1994 byok-not-platform guard still + # passes — the slash/platform form `minimax/MiniMax-M2.7` would not. + E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'minimax:MiniMax-M2.7' }} E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }} diff --git a/tests/e2e/lib/model_slug.sh b/tests/e2e/lib/model_slug.sh index 93207c96f..efb5fd71f 100755 --- a/tests/e2e/lib/model_slug.sh +++ b/tests/e2e/lib/model_slug.sh @@ -11,7 +11,10 @@ # default + 401, see PR #1714.) # # claude-code → auth-aware: -# E2E_MINIMAX_API_KEY → "MiniMax-M2" +# E2E_MINIMAX_API_KEY → "minimax:MiniMax-M2.7" +# (colon-namespaced BYOK id; bare +# "MiniMax-M2" 400s on a deploy-skewed +# staging registry — #2263) # E2E_ANTHROPIC_API_KEY → "claude-sonnet-4-6" # otherwise → "sonnet" # @@ -82,7 +85,17 @@ pick_model_slug() { hermes) printf 'openai/gpt-4o' ;; claude-code) if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then - printf 'MiniMax-M2' + # Namespaced (colon) BYOK id, not bare "MiniMax-M2" (#2263 deploy-skew): + # bare ids can lag the deployed staging ws-server's compiled registry, + # so workspace-create's validateRegisteredModelForRuntime 400s the bare + # form on an older image. The colon-namespaced `minimax:MiniMax-M2.7` + # resolves the same way the proven-working sibling `moonshot/kimi-k2.6` + # does. It stays in the BYOK `minimax` arm (providers.yaml:851), so + # DeriveProvider -> provider_selection=minimax (BYOK) and the #1994 + # byok-not-platform guard (test_staging_full_saas.sh:1000) still passes — + # unlike the slash/platform form `minimax/MiniMax-M2.7`, which resolves + # to provider=platform and would trip that guard. + printf 'minimax:MiniMax-M2.7' elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then printf 'claude-sonnet-4-6' else diff --git a/tests/e2e/test_model_slug.sh b/tests/e2e/test_model_slug.sh index e3282c41b..32b805fb0 100755 --- a/tests/e2e/test_model_slug.sh +++ b/tests/e2e/test_model_slug.sh @@ -49,13 +49,13 @@ run_test "codex → slash-form fallback" codex run_test "claude-code → OAuth/default alias" claude-code "sonnet" got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code) -assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "MiniMax-M2" +assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "minimax:MiniMax-M2.7" got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code) assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6" got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code) -assert_eq "claude-code + both keys → MiniMax priority" "$got" "MiniMax-M2" +assert_eq "claude-code + both keys → MiniMax priority" "$got" "minimax:MiniMax-M2.7" # ── Fallback for unknown runtime ── # Picks slash-form (hermes-shaped) since hermes is the historical diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 3761ace5f..40b6e5030 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -886,7 +886,7 @@ fi # identical on main's scheduled synthetic E2E and on PRs (so it is an # environmental backend regression, never PR-introduced). if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then - fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (MiniMax-M2 since #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT" + fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code default is minimax:MiniMax-M2.7 since #2263; was bare MiniMax-M2 #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT" fi # Generic catch-all — falls through if none of the known regressions hit. if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then @@ -952,7 +952,14 @@ for KA_ATTEMPT in $(seq 1 6); do KA_SAFE_BODY=$(printf '%s' "$KA_RESP" | sanitize_http_body) # Retry ONLY on transient transport errors — never on an agent-level # error (those must surface and fail the gate). - if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then + # #2263: include the Cloudflare-shaped literal `error code: 502/504` token so a + # bare edge/gateway 502 (no "Bad Gateway" body) is retried here the same way the + # cold-start PONG probe (line ~800) and the delegation loop (line ~1234) already + # do. Without it, a single un-retried edge 502 right after a healthy round-trip + # fell through to break and failed the gate on the first attempt (Platform Boot + # job, task 268859). Bounded by the existing 6-attempt / sleep-10 loop — no new + # sleep-as-fix; this only widens the transient-match to the sibling pattern. + if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then log " known-answer A2A transient $KA_CODE attempt $KA_ATTEMPT/6: $KA_SAFE_BODY" if [ "$KA_ATTEMPT" -lt 6 ]; then sleep 10; continue; fi fi -- 2.52.0