From 6f8f9789759a0bfa9988b3f4d0cdebad842641ad Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 00:18:03 -0700 Subject: [PATCH 1/2] canary-staging: migrate from hermes+OpenAI to claude-code+MiniMax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the migration continuous-synth-e2e.yml made on 2026-05-03 (#265). Both workflows hit the same MOLECULE_STAGING_OPENAI_KEY which went over quota on 2026-05-01 (#2578) and stayed dead — the canary has been red for 36+ hours waiting on operator billing top-up. This switch breaks the canary's dependency on OpenAI billing entirely: claude-code template's `minimax` provider routes ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot. MiniMax is ~5-10x cheaper per token than gpt-4.1-mini AND on a separate billing account, so a future OpenAI quota collapse no longer wedges the canary's "is staging alive?" signal. Changes: - E2E_RUNTIME: hermes → claude-code - Add E2E_MODEL_SLUG: MiniMax-M2.7-highspeed (pin to MiniMax — the per-runtime claude-code default is "sonnet" which routes to direct Anthropic and would defeat the cost saving) - Add E2E_MINIMAX_API_KEY env wired to MOLECULE_STAGING_MINIMAX_API_KEY - Keep E2E_OPENAI_API_KEY as fallback for operator-dispatched runs that set E2E_RUNTIME=hermes via workflow_dispatch - "Verify OpenAI key present" → per-runtime "Verify LLM key present" case statement matching synth E2E's exact shape (claude-code requires MiniMax, langgraph/hermes require OpenAI). Hard-fail on missing required key per #2578's lesson — soft-skip silently fell through to the wrong SECRETS_JSON branch and produced a confusing auth error 5 min later instead of the clean "secret missing" message at the top. Verifies #2578 root cause won't recur on the canary path. The synth E2E and the manual e2e-staging-saas dispatch can still hit OpenAI when explicitly chosen — only the cron canary moves off it. 
--- .github/workflows/canary-staging.yml | 63 +++++++++++++++++++++------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml index 93f53ca7..37037156 100644 --- a/.github/workflows/canary-staging.yml +++ b/.github/workflows/canary-staging.yml @@ -50,19 +50,30 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} - # Without an LLM key the test_staging_full_saas.sh script provisions - # the workspace with empty secrets, hermes derive-provider.sh resolves - # `openai/gpt-4o` to PROVIDER=openrouter, no OPENROUTER_API_KEY is - # found in env, and A2A returns "No LLM provider configured" at - # request time (canary step 8/11). The full-lifecycle workflow - # (e2e-staging-saas.yml) has carried this secret since launch — the - # canary regressed when it was first split out and lost the env - # block. Issue #1500 had ~30 consecutive failures before this was - # spotted; do NOT remove without re-reading the script's secrets- - # injection block. + # MiniMax is the canary's PRIMARY LLM auth path post-2026-05-04. + # Switched from hermes+OpenAI after #2578 (the staging OpenAI key + # account went over quota and stayed dead for 36+ hours, taking + # the canary red the entire time). claude-code template's + # `minimax` provider routes ANTHROPIC_BASE_URL to + # api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot — + # ~5-10x cheaper per token than gpt-4.1-mini AND on a separate + # billing account, so OpenAI quota collapse no longer wedges the + # canary. Mirrors the migration continuous-synth-e2e.yml made on + # 2026-05-03 (#265) for the same reason. tests/e2e/test_staging_ + # full_saas.sh branches SECRETS_JSON on which key is present — + # MiniMax wins when set. 
+ E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }} + # OpenAI fallback — kept wired for hermes/langgraph runs. + # NOTE(review): E2E_RUNTIME and E2E_MODEL_SLUG below are literals, so a + # dispatch-time runtime override presumably needs an input wired in — confirm. E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} E2E_MODE: canary - E2E_RUNTIME: hermes + E2E_RUNTIME: claude-code + # Pin the canary to a specific MiniMax model rather than relying + # on the per-runtime default (which could resolve to "sonnet" → + # direct Anthropic and defeat the cost saving). M2.7-highspeed + # is "Token Plan only" but cheap-per-token and fast. + E2E_MODEL_SLUG: MiniMax-M2.7-highspeed E2E_RUN_ID: "canary-${{ github.run_id }}" steps: @@ -75,13 +86,35 @@ jobs: exit 2 fi - - name: Verify OpenAI key present + - name: Verify LLM key present run: | - if [ -z "$E2E_OPENAI_API_KEY" ]; then - echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'" + # Per-runtime key check — claude-code uses MiniMax; hermes / + # langgraph (operator-dispatched only) use OpenAI. Hard-fail + # rather than soft-skip per the lesson from synth E2E #2578: + # an empty key silently falls through to the wrong + # SECRETS_JSON branch and the canary fails 5 min later with + # a confusing auth error instead of the clean "secret + # missing" message at the top.
+ case "${E2E_RUNTIME}" in + claude-code) + required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY" + required_secret_value="${E2E_MINIMAX_API_KEY:-}" + ;; + langgraph|hermes) + required_secret_name="MOLECULE_STAGING_OPENAI_KEY" + required_secret_value="${E2E_OPENAI_API_KEY:-}" + ;; + *) + echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check" + required_secret_name="" + required_secret_value="present" + ;; + esac + if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then + echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — A2A will fail at request time with 'No LLM provider configured'" exit 2 fi - echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})" + if [ -n "$required_secret_name" ]; then echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"; fi # a skipped check (unknown runtime) must not report a key as present - name: Canary run id: canary From eaee113416215fef012fff4d2b775faee07ae53d Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 00:20:36 -0700 Subject: [PATCH 2/2] e2e-staging-saas: same migration off OpenAI default to claude-code+MiniMax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundles the same hermes+OpenAI → claude-code+MiniMax migration onto the full-lifecycle E2E that's been red on every provisioning-critical push since 2026-05-01. Same root cause as the canary fix in the prior commit: MOLECULE_STAGING_OPENAI_KEY hit insufficient_quota and there's no SLA on operator billing top-up. Same shape as canary commit: claude-code as default runtime + MiniMax as primary key + hermes/langgraph kept as workflow_dispatch options with OpenAI fallback. Per-runtime verify-key case-statement matches canary-staging.yml + continuous-synth-e2e.yml byte-for-byte. Two extra wrinkles vs canary: - Dispatch input `runtime` default flipped from "hermes" to "claude-code" so operators dispatching from the UI get the safe path by default.
They can still pick hermes/langgraph from the dropdown when they specifically want to exercise OpenAI. - E2E_MODEL_SLUG is dispatch-aware: MiniMax-M2.7-highspeed for claude-code, openai/gpt-4o for hermes (slash-form per derive-provider.sh), openai:gpt-4o for langgraph (colon-form per init_chat_model). The branch comment in lib/model_slug.sh covers the rationale; pinning the slug here keeps the dispatch UX stable even when operators don't override. After this lands + the canary commit lands, the only OpenAI-dependent E2E surface is the operator-dispatch fallback. The cron canary, the synth E2E, AND the full-lifecycle gate are all on MiniMax — separate billing account, no OpenAI quota dependency on auto-runs. --- .github/workflows/e2e-staging-saas.yml | 57 +++++++++++++++++++++----- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml index 2a7efe16..2c252d10 100644 --- a/.github/workflows/e2e-staging-saas.yml +++ b/.github/workflows/e2e-staging-saas.yml @@ -48,9 +48,9 @@ on: workflow_dispatch: inputs: runtime: - description: "Runtime to test (hermes | claude-code | langgraph)" + description: "Runtime to test (claude-code [default, MiniMax] | hermes [OpenAI] | langgraph [OpenAI])" required: false - default: "hermes" + default: "claude-code" keep_org: description: "Skip teardown for debugging (only use via manual dispatch!)" required: false @@ -83,11 +83,27 @@ jobs: # retrieval + teardown. Configure in # Settings → Secrets and variables → Actions → Repository secrets. MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} - # OpenAI key for workspace LLM calls (section 8 A2A). Without it, - # Hermes runtime crashes at boot with "No provider API key found". - # Configure at Settings → Secrets → Actions → MOLECULE_STAGING_OPENAI_KEY. + # MiniMax is the PRIMARY LLM auth path post-2026-05-04. 
Switched + # from hermes+OpenAI default after #2578 (the staging OpenAI key + # account went over quota and stayed dead for 36+ hours, taking + # the full-lifecycle E2E red on every provisioning-critical push). + # claude-code template's `minimax` provider routes + # ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads + # MINIMAX_API_KEY at boot — separate billing account so an + # OpenAI quota collapse no longer wedges the gate. Mirrors the + # canary-staging.yml + continuous-synth-e2e.yml migrations. + E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }} + # OpenAI fallback — kept wired so an operator-dispatched run with + # E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still + # exercise the OpenAI path. E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} - E2E_RUNTIME: ${{ github.event.inputs.runtime || 'hermes' }} + E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }} + # Pin the model per runtime: MiniMax-M2.7-highspeed unless the + # dispatch `runtime` input is hermes (openai/gpt-4o) or langgraph + # (openai:gpt-4o). Without the pin, the claude-code default ("sonnet") + # routes to direct Anthropic and defeats the cost saving. There is no + # dispatch input for the slug itself yet — it follows `runtime` only. + E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }} E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }} @@ -102,13 +118,34 @@ jobs: fi echo "Admin token present ✓" - - name: Verify OpenAI key present + - name: Verify LLM key present run: | - if [ -z "$E2E_OPENAI_API_KEY" ]; then - echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — workspaces will fail at boot with 'No provider API key found'" + # Per-runtime key check — claude-code uses MiniMax; hermes / + # langgraph (operator-dispatched only) use OpenAI.
Hard-fail + # rather than soft-skip per #2578's lesson — empty key + # silently falls through to the wrong SECRETS_JSON branch and + # produces a confusing auth error 5 min later instead of the + # clean "secret missing" message at the top. + case "${E2E_RUNTIME}" in + claude-code) + required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY" + required_secret_value="${E2E_MINIMAX_API_KEY:-}" + ;; + langgraph|hermes) + required_secret_name="MOLECULE_STAGING_OPENAI_KEY" + required_secret_value="${E2E_OPENAI_API_KEY:-}" + ;; + *) + echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check" + required_secret_name="" + required_secret_value="present" + ;; + esac + if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then + echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — workspaces will fail at boot with 'No provider API key found'" exit 2 fi - echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})" + if [ -n "$required_secret_name" ]; then echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"; fi # a skipped check (unknown runtime) must not report a key as present - name: CP staging health preflight run: |