From eaee113416215fef012fff4d2b775faee07ae53d Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 00:20:36 -0700 Subject: [PATCH] e2e-staging-saas: apply the same migration off the OpenAI default to claude-code+MiniMax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundles the same hermes+OpenAI → claude-code+MiniMax migration onto the full-lifecycle E2E that's been red on every provisioning-critical push since 2026-05-01. Same root cause as the canary fix in the prior commit: MOLECULE_STAGING_OPENAI_KEY hit insufficient_quota and there's no SLA on operator billing top-up. Same shape as the canary commit: claude-code as default runtime + MiniMax as primary key + hermes/langgraph kept as workflow_dispatch options with OpenAI fallback. Per-runtime verify-key case-statement matches canary-staging.yml + continuous-synth-e2e.yml byte-for-byte. Two extra wrinkles vs canary: - Dispatch input `runtime` default flipped from "hermes" to "claude-code" so operators dispatching from the UI get the safe path by default. They can still pick hermes/langgraph from the dropdown when they specifically want to exercise OpenAI. - E2E_MODEL_SLUG is dispatch-aware: MiniMax-M2.7-highspeed for claude-code, openai/gpt-4o for hermes (slash-form per derive-provider.sh), openai:gpt-4o for langgraph (colon-form per init_chat_model). The branch comment in lib/model_slug.sh covers the rationale; pinning the slug here keeps the dispatch UX stable even when operators don't override. After this lands + the canary commit lands, the only OpenAI-dependent E2E surface is the operator-dispatch fallback. The cron canary, the synth E2E, AND the full-lifecycle gate are all on MiniMax — separate billing account, no OpenAI quota dependency on auto-runs. 
--- .github/workflows/e2e-staging-saas.yml | 57 +++++++++++++++++++++----- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml index 2a7efe16..2c252d10 100644 --- a/.github/workflows/e2e-staging-saas.yml +++ b/.github/workflows/e2e-staging-saas.yml @@ -48,9 +48,9 @@ on: workflow_dispatch: inputs: runtime: - description: "Runtime to test (hermes | claude-code | langgraph)" + description: "Runtime to test (claude-code [default, MiniMax] | hermes [OpenAI] | langgraph [OpenAI])" required: false - default: "hermes" + default: "claude-code" keep_org: description: "Skip teardown for debugging (only use via manual dispatch!)" required: false @@ -83,11 +83,27 @@ jobs: # retrieval + teardown. Configure in # Settings → Secrets and variables → Actions → Repository secrets. MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} - # OpenAI key for workspace LLM calls (section 8 A2A). Without it, - # Hermes runtime crashes at boot with "No provider API key found". - # Configure at Settings → Secrets → Actions → MOLECULE_STAGING_OPENAI_KEY. + # MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched + # from hermes+OpenAI default after #2578 (the staging OpenAI key + # account went over quota and stayed dead for 36+ hours, taking + # the full-lifecycle E2E red on every provisioning-critical push). + # claude-code template's `minimax` provider routes + # ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads + # MINIMAX_API_KEY at boot — separate billing account so an + # OpenAI quota collapse no longer wedges the gate. Mirrors the + # canary-staging.yml + continuous-synth-e2e.yml migrations. + E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }} + # OpenAI fallback — kept wired so an operator-dispatched run with + # E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still + # exercise the OpenAI path. 
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} - E2E_RUNTIME: ${{ github.event.inputs.runtime || 'hermes' }} + E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }} + # Pin the model when running on the default claude-code path — + # the per-runtime default ("sonnet") routes to direct Anthropic + # and defeats the cost saving. Operators can override via the + # workflow_dispatch flow (no input wired here yet — runtime + # override is enough for ad-hoc). + E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }} E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }} @@ -102,13 +118,34 @@ jobs: fi echo "Admin token present ✓" - - name: Verify OpenAI key present + - name: Verify LLM key present run: | - if [ -z "$E2E_OPENAI_API_KEY" ]; then - echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — workspaces will fail at boot with 'No provider API key found'" + # Per-runtime key check — claude-code uses MiniMax; hermes / + # langgraph (operator-dispatched only) use OpenAI. Hard-fail + # rather than soft-skip per #2578's lesson — empty key + # silently falls through to the wrong SECRETS_JSON branch and + # produces a confusing auth error 5 min later instead of the + # clean "secret missing" message at the top. 
+ case "${E2E_RUNTIME}" in + claude-code) + required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY" + required_secret_value="${E2E_MINIMAX_API_KEY:-}" + ;; + langgraph|hermes) + required_secret_name="MOLECULE_STAGING_OPENAI_KEY" + required_secret_value="${E2E_OPENAI_API_KEY:-}" + ;; + *) + echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check" + required_secret_name="" + required_secret_value="present" + ;; + esac + if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then + echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — workspaces will fail at boot with 'No provider API key found'" exit 2 fi - echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})" + echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})" - name: CP staging health preflight run: |