diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 29fb69434..8cacae807 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -399,10 +399,9 @@ jobs: # a manual action that determinism made obsolete. name: Canvas Deploy Status runs-on: docker-host - # Job-level `if:` so ci-required-drift.py's ci_job_names() detects this as - # github.ref-gated and skips it from the required-context F1 set (mc#1982). + # Per-step no-op (not job-level `if:`) so the job reaches SUCCESS on PRs + # instead of skipped — skipped poisons the PR combined status (internal#817). # Step-level exit 0 handles the "not a canvas main push" case. - if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging' }} needs: [changes, canvas-build] steps: - name: Record canvas ordered-deploy status @@ -514,9 +513,8 @@ jobs: # The `needs:` list MUST stay in lockstep with ci-required-drift.py's # F1 check (`ci_job_names()` = every job MINUS the sentinel MINUS jobs # whose `if:` gates on github.event_name/github.ref). canvas-deploy- - # reminder is event-gated (`if: github.ref == refs/heads/{main,staging}`) - # so it is intentionally EXCLUDED — it skips on PRs and a `needs:` on a - # skipped job would never let the sentinel run. If a new always-running + # status is per-step-gated (not job-level `if:`) so it reaches SUCCESS + # on PRs and is included here — internal#817. If a new always-running # CI job is added, add it here too or ci-required-drift F1 will flag it. 
# # Stays on the dedicated `ci-meta` lane (no docker work, so the @@ -530,6 +528,7 @@ jobs: - canvas-build - shellcheck - python-lint + - canvas-deploy-status continue-on-error: false runs-on: ci-meta timeout-minutes: 5 @@ -548,6 +547,7 @@ jobs: CANVAS_RESULT: ${{ needs.canvas-build.result }} SHELLCHECK_RESULT: ${{ needs.shellcheck.result }} PYTHON_LINT_RESULT: ${{ needs.python-lint.result }} + CANVAS_DEPLOY_RESULT: ${{ needs.canvas-deploy-status.result }} run: | set -euo pipefail fail=0 @@ -569,6 +569,7 @@ jobs: check "Canvas (Next.js)" "$CANVAS_RESULT" check "Shellcheck (E2E scripts)" "$SHELLCHECK_RESULT" check "Python Lint & Test" "$PYTHON_LINT_RESULT" + check "Canvas Deploy Status" "$CANVAS_DEPLOY_RESULT" if [ "$fail" -ne 0 ]; then echo "::error::all-required: one or more aggregated CI jobs did not succeed" exit 1 diff --git a/.gitea/workflows/continuous-synth-e2e.yml b/.gitea/workflows/continuous-synth-e2e.yml index 6071b9165..ea5cf06af 100644 --- a/.gitea/workflows/continuous-synth-e2e.yml +++ b/.gitea/workflows/continuous-synth-e2e.yml @@ -118,7 +118,7 @@ jobs: timeout-minutes: 20 env: # claude-code default: cold-start ~5 min (comparable to langgraph), - # but uses MiniMax-M2 via the template's third-party- + # but uses MiniMax-M2.7 via the template's third-party- # Anthropic-compat path (workspace-configs-templates/claude-code- # default/config.yaml:64-69). MiniMax is ~5-10x cheaper than # gpt-4.1-mini per token AND avoids the recurring OpenAI quota- @@ -131,9 +131,9 @@ jobs: # on the per-runtime default ("sonnet" → routes to direct # Anthropic, defeats the cost saving). Operators can override # via workflow_dispatch by setting a different E2E_MODEL_SLUG - # input if they need to exercise a specific model. MiniMax-M2 is the + # input if they need to exercise a specific model. MiniMax-M2.7 is the # stable staging MiniMax path used by the full-SaaS smoke. 
- E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2' }} + E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7' }} # Bound to 10 min so a stuck provision fails the run instead of # holding up the next cron firing. 15-min default in the script # is for the on-PR full lifecycle where we have more headroom. diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index 82b3c46d8..c41cbf048 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -172,7 +172,7 @@ jobs: # and defeats the cost saving. Operators can override via the # workflow_dispatch flow (no input wired here yet — runtime # override is enough for ad-hoc). - E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2' }} + E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2.7' }} E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }} diff --git a/.gitea/workflows/staging-smoke.yml b/.gitea/workflows/staging-smoke.yml index 9e3fce6a8..5c5295fe7 100644 --- a/.gitea/workflows/staging-smoke.yml +++ b/.gitea/workflows/staging-smoke.yml @@ -112,9 +112,9 @@ jobs: E2E_RUNTIME: claude-code # Pin the smoke to a specific MiniMax model rather than relying # on the per-runtime default (which could resolve to "sonnet" → - # direct Anthropic and defeat the cost saving). MiniMax-M2 is the + # direct Anthropic and defeat the cost saving). MiniMax-M2.7 is the # stable staging MiniMax path used by the full-SaaS smoke. 
- E2E_MODEL_SLUG: MiniMax-M2 + E2E_MODEL_SLUG: MiniMax-M2.7 E2E_RUN_ID: "smoke-${{ github.run_id }}" # Debug-only: when an operator dispatches with keep_on_failure=true, # the smoke script's E2E_KEEP_ORG=1 path skips teardown so the diff --git a/tests/e2e/lib/model_slug.sh b/tests/e2e/lib/model_slug.sh index 93207c96f..efa64f9f4 100755 --- a/tests/e2e/lib/model_slug.sh +++ b/tests/e2e/lib/model_slug.sh @@ -11,7 +11,7 @@ # default + 401, see PR #1714.) # # claude-code → auth-aware: -# E2E_MINIMAX_API_KEY → "MiniMax-M2" +# E2E_MINIMAX_API_KEY → "MiniMax-M2.7" # E2E_ANTHROPIC_API_KEY → "claude-sonnet-4-6" # otherwise → "sonnet" # @@ -82,7 +82,7 @@ pick_model_slug() { hermes) printf 'openai/gpt-4o' ;; claude-code) if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then - printf 'MiniMax-M2' + printf 'MiniMax-M2.7' elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then printf 'claude-sonnet-4-6' else diff --git a/tests/e2e/test_model_slug.sh b/tests/e2e/test_model_slug.sh index e3282c41b..c6bc57b73 100755 --- a/tests/e2e/test_model_slug.sh +++ b/tests/e2e/test_model_slug.sh @@ -49,13 +49,13 @@ run_test "codex → slash-form fallback" codex run_test "claude-code → OAuth/default alias" claude-code "sonnet" got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code) -assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "MiniMax-M2" +assert_eq "claude-code + MiniMax key → MiniMax model" "$got" "MiniMax-M2.7" got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code) assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6" got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code) -assert_eq "claude-code + both keys → MiniMax priority" "$got" "MiniMax-M2" +assert_eq "claude-code + both keys → MiniMax priority" "$got" "MiniMax-M2.7" # ── Fallback for unknown runtime ── # Picks 
slash-form (hermes-shaped) since hermes is the historical diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 3761ace5f..3fc13bebb 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -881,12 +881,13 @@ fi # and NOT NOT_CONFIGURED (that fails earlier, at boot). Name it explicitly # so the canary alert points at the model, not the platform: a generic # "error-shaped response" misdirects triage to workspace-server. Observed -# 2026-06-03/04 across every staging canary on MODEL_SLUG=MiniMax-M2 (the -# canary default since #2710) — 100% on the parent's first cold turn, +# 2026-06-03/04 across every staging canary on the default MODEL_SLUG (set +# by #2710; switched to MiniMax-M2.7 on 2026-06-04 after M2 was +# rejected by staging with HTTP 400) — 100% on the parent's first cold turn, # identical on main's scheduled synthetic E2E and on PRs (so it is an # environmental backend regression, never PR-introduced). if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then - fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (MiniMax-M2 since #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT" + fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (MiniMax-M2.7 since #2710, updated 2026-06-04) — empty assistant turns, not an auth/quota/boot fault.
Raw: $AGENT_TEXT" fi # Generic catch-all — falls through if none of the known regressions hit. if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then @@ -952,7 +953,7 @@ for KA_ATTEMPT in $(seq 1 6); do KA_SAFE_BODY=$(printf '%s' "$KA_RESP" | sanitize_http_body) # Retry ONLY on transient transport errors — never on an agent-level # error (those must surface and fail the gate). - if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then + if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then log " known-answer A2A transient $KA_CODE attempt $KA_ATTEMPT/6: $KA_SAFE_BODY" if [ "$KA_ATTEMPT" -lt 6 ]; then sleep 10; continue; fi fi diff --git a/tests/test_e2e_minimax_defaults.py b/tests/test_e2e_minimax_defaults.py index b658ff369..057486daf 100644 --- a/tests/test_e2e_minimax_defaults.py +++ b/tests/test_e2e_minimax_defaults.py @@ -1,3 +1,4 @@ +import re from pathlib import Path @@ -14,5 +15,7 @@ def test_staging_e2e_workflows_use_stable_minimax_default() -> None: for rel in workflow_paths: text = (ROOT / rel).read_text() assert "MiniMax-M2.7-highspeed" not in text - assert "MiniMax-M2" in text + # Reject bare MiniMax-M2 (not followed by "."), but allow MiniMax-M2.7; the -highspeed variant stays banned by the assert above. + assert re.search(r"MiniMax-M2(?!\.)", text) is None + assert "MiniMax-M2.7" in text