2026-06-05 03:25:29 +00:00
4 changed files with 36 additions and 7 deletions
@@ -172,7 +172,16 @@ jobs:
      # and defeats the cost saving. Operators can override via the
      # workflow_dispatch flow (no input wired here yet — runtime
      # override is enough for ad-hoc).
-      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2' }}
+      #
+      # #2263 deploy-skew: the claude-code default is the COLON-namespaced BYOK
+      # id `minimax:MiniMax-M2.7`, NOT bare `MiniMax-M2`. The deployed staging
+      # ws-server's compiled registry can lag source; validateRegisteredModelForRuntime
+      # 400s the bare form on an older image (the sibling Platform Boot job, on
+      # the SAME image, succeeds with namespaced `moonshot/kimi-k2.6`). The colon
+      # form stays in the BYOK `minimax` arm (providers.yaml:851) so it resolves
+      # provider=minimax (BYOK) and the #1994 byok-not-platform guard still
+      # passes — the slash/platform form `minimax/MiniMax-M2.7` would not.
+      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'minimax:MiniMax-M2.7' }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}

@@ -11,7 +11,10 @@
 #                                    default + 401, see PR #1714.)
 #
 #   claude-code → auth-aware:
-#                  E2E_MINIMAX_API_KEY    → "MiniMax-M2"
+#                  E2E_MINIMAX_API_KEY    → "minimax:MiniMax-M2.7"
+#                                           (colon-namespaced BYOK id; bare
+#                                            "MiniMax-M2" 400s on a deploy-skewed
+#                                            staging registry — #2263)
 #                  E2E_ANTHROPIC_API_KEY  → "claude-sonnet-4-6"
 #                  otherwise              → "sonnet"
 #
@@ -82,7 +85,17 @@ pick_model_slug() {
    hermes)      printf 'openai/gpt-4o' ;;
    claude-code)
      if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
-        printf 'MiniMax-M2'
+        # Namespaced (colon) BYOK id, not bare "MiniMax-M2" (#2263 deploy-skew):
+        # the deployed staging ws-server's compiled registry can lag source,
+        # so workspace-create's validateRegisteredModelForRuntime 400s the bare
+        # form on an older image. The colon-namespaced `minimax:MiniMax-M2.7`
+        # resolves the same way the proven-working sibling `moonshot/kimi-k2.6`
+        # does. It stays in the BYOK `minimax` arm (providers.yaml:851), so
+        # DeriveProvider -> provider_selection=minimax (BYOK) and the #1994
+        # byok-not-platform guard (test_staging_full_saas.sh:1000) still passes —
+        # unlike the slash/platform form `minimax/MiniMax-M2.7`, which resolves
+        # to provider=platform and would trip that guard.
+        printf 'minimax:MiniMax-M2.7'
      elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
        printf 'claude-sonnet-4-6'
      else
@@ -49,13 +49,13 @@ run_test "codex → slash-form fallback"                             codex
 run_test "claude-code → OAuth/default alias"                      claude-code "sonnet"

 got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code)
-assert_eq "claude-code + MiniMax key → MiniMax model"             "$got" "MiniMax-M2"
+assert_eq "claude-code + MiniMax key → MiniMax model"             "$got" "minimax:MiniMax-M2.7"

 got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code)
 assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6"

 got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code)
-assert_eq "claude-code + both keys → MiniMax priority"            "$got" "MiniMax-M2"
+assert_eq "claude-code + both keys → MiniMax priority"            "$got" "minimax:MiniMax-M2.7"

 # ── Fallback for unknown runtime ──
 # Picks slash-form (hermes-shaped) since hermes is the historical
@@ -886,7 +886,7 @@ fi
 # identical on main's scheduled synthetic E2E and on PRs (so it is an
 # environmental backend regression, never PR-introduced).
 if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
-  fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (MiniMax-M2 since #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
+  fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code default is minimax:MiniMax-M2.7 since #2263; was bare MiniMax-M2 #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
 fi
 # Generic catch-all — falls through if none of the known regressions hit.
 if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
@@ -952,7 +952,14 @@ for KA_ATTEMPT in $(seq 1 6); do
  KA_SAFE_BODY=$(printf '%s' "$KA_RESP" | sanitize_http_body)
  # Retry ONLY on transient transport errors — never on an agent-level
  # error (those must surface and fail the gate).
-  if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
+  # #2263: also match the Cloudflare-style literals `error code: 502` and `error code: 504` so a
+  # bare edge/gateway 502 (no "Bad Gateway" body) is retried here the same way the
+  # cold-start PONG probe (line ~800) and the delegation loop (line ~1234) already
+  # do. Without it, a single un-retried edge 502 right after a healthy round-trip
+  # fell through to break and failed the gate on the first attempt (Platform Boot
+  # job, task 268859). Bounded by the existing 6-attempt / sleep-10 loop — no new
+  # sleep-as-fix; this only widens the transient-match to the sibling pattern.
+  if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
    log "    known-answer A2A transient $KA_CODE attempt $KA_ATTEMPT/6: $KA_SAFE_BODY"
    if [ "$KA_ATTEMPT" -lt 6 ]; then sleep 10; continue; fi
  fi