From 6f8f9789759a0bfa9988b3f4d0cdebad842641ad Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangrabbit@gmail.com>
Date: Mon, 4 May 2026 00:18:03 -0700
Subject: [PATCH 1/2] canary-staging: migrate from hermes+OpenAI to
 claude-code+MiniMax
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror the migration continuous-synth-e2e.yml made on 2026-05-03 (#265).
Both workflows hit the same MOLECULE_STAGING_OPENAI_KEY which went over
quota on 2026-05-01 (#2578) and stayed dead — the canary has been red
for 36+ hours waiting on operator billing top-up.

This switch breaks the canary's dependency on OpenAI billing entirely:
claude-code template's `minimax` provider routes ANTHROPIC_BASE_URL to
api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot. MiniMax is
~5-10x cheaper per token than gpt-4.1-mini AND on a separate billing
account, so a future OpenAI quota collapse no longer wedges the
canary's "is staging alive?" signal.

Changes:
- E2E_RUNTIME: hermes → claude-code
- Add E2E_MODEL_SLUG: MiniMax-M2.7-highspeed (pin to MiniMax — the
  per-runtime claude-code default is "sonnet" which routes to direct
  Anthropic and would defeat the cost saving)
- Add E2E_MINIMAX_API_KEY env wired to MOLECULE_STAGING_MINIMAX_API_KEY
- Keep E2E_OPENAI_API_KEY as fallback for operator-dispatched runs that
  set E2E_RUNTIME=hermes via workflow_dispatch
- "Verify OpenAI key present" → per-runtime "Verify LLM key present"
  case statement matching synth E2E's exact shape (claude-code requires
  MiniMax, langgraph/hermes require OpenAI). Hard-fail on missing
  required key per #2578's lesson — soft-skip silently fell through to
  the wrong SECRETS_JSON branch and produced a confusing auth error
  5 min later instead of the clean "secret missing" message at the top.

Verifies #2578 root cause won't recur on the canary path. The synth
E2E and the manual e2e-staging-saas dispatch can still hit OpenAI when
explicitly chosen — only the cron canary moves off it.
---
 .github/workflows/canary-staging.yml | 63 +++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml
index 93f53ca7..37037156 100644
--- a/.github/workflows/canary-staging.yml
+++ b/.github/workflows/canary-staging.yml
@@ -50,19 +50,30 @@ jobs:
     env:
       MOLECULE_CP_URL: https://staging-api.moleculesai.app
       MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
-      # Without an LLM key the test_staging_full_saas.sh script provisions
-      # the workspace with empty secrets, hermes derive-provider.sh resolves
-      # `openai/gpt-4o` to PROVIDER=openrouter, no OPENROUTER_API_KEY is
-      # found in env, and A2A returns "No LLM provider configured" at
-      # request time (canary step 8/11). The full-lifecycle workflow
-      # (e2e-staging-saas.yml) has carried this secret since launch — the
-      # canary regressed when it was first split out and lost the env
-      # block. Issue #1500 had ~30 consecutive failures before this was
-      # spotted; do NOT remove without re-reading the script's secrets-
-      # injection block.
+      # MiniMax is the canary's PRIMARY LLM auth path post-2026-05-04.
+      # Switched from hermes+OpenAI after #2578 (the staging OpenAI key
+      # account went over quota and stayed dead for 36+ hours, taking
+      # the canary red the entire time). claude-code template's
+      # `minimax` provider routes ANTHROPIC_BASE_URL to
+      # api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot —
+      # ~5-10x cheaper per token than gpt-4.1-mini AND on a separate
+      # billing account, so OpenAI quota collapse no longer wedges the
+      # canary. Mirrors the migration continuous-synth-e2e.yml made on
+      # 2026-05-03 (#265) for the same reason. tests/e2e/test_staging_
+      # full_saas.sh branches SECRETS_JSON on which key is present —
+      # MiniMax wins when set.
+      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
+      # OpenAI fallback — kept wired so an operator-dispatched run with
+      # E2E_RUNTIME=hermes overridden via workflow_dispatch can still
+      # exercise the OpenAI path without re-editing the workflow.
       E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
       E2E_MODE: canary
-      E2E_RUNTIME: hermes
+      E2E_RUNTIME: claude-code
+      # Pin the canary to a specific MiniMax model rather than relying
+      # on the per-runtime default (which could resolve to "sonnet" →
+      # direct Anthropic and defeat the cost saving). M2.7-highspeed
+      # is "Token Plan only" but cheap-per-token and fast.
+      E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
       E2E_RUN_ID: "canary-${{ github.run_id }}"
 
     steps:
@@ -75,13 +86,35 @@ jobs:
             exit 2
           fi
 
-      - name: Verify OpenAI key present
+      - name: Verify LLM key present
         run: |
-          if [ -z "$E2E_OPENAI_API_KEY" ]; then
-            echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — A2A will fail at request time with 'No LLM provider configured'"
+          # Per-runtime key check — claude-code uses MiniMax; hermes /
+          # langgraph (operator-dispatched only) use OpenAI. Hard-fail
+          # rather than soft-skip per the lesson from synth E2E #2578:
+          # an empty key silently falls through to the wrong
+          # SECRETS_JSON branch and the canary fails 5 min later with
+          # a confusing auth error instead of the clean "secret
+          # missing" message at the top.
+          case "${E2E_RUNTIME}" in
+            claude-code)
+              required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
+              required_secret_value="${E2E_MINIMAX_API_KEY:-}"
+              ;;
+            langgraph|hermes)
+              required_secret_name="MOLECULE_STAGING_OPENAI_KEY"
+              required_secret_value="${E2E_OPENAI_API_KEY:-}"
+              ;;
+            *)
+              echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
+              required_secret_name=""
+              required_secret_value="present"
+              ;;
+          esac
+          if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
+            echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — A2A will fail at request time with 'No LLM provider configured'"
             exit 2
           fi
-          echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})"
+          echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
 
       - name: Canary run
         id: canary

From eaee113416215fef012fff4d2b775faee07ae53d Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangrabbit@gmail.com>
Date: Mon, 4 May 2026 00:20:36 -0700
Subject: [PATCH 2/2] e2e-staging-saas: same migration off OpenAI default to
 claude-code+MiniMax
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bundles the same hermes+OpenAI → claude-code+MiniMax migration onto
the full-lifecycle E2E that's been red on every provisioning-critical
push since 2026-05-01. Same root cause as the canary fix in the prior
commit: MOLECULE_STAGING_OPENAI_KEY hit insufficient_quota and there's
no SLA on operator billing top-up.

Same shape as canary commit: claude-code as default runtime + MiniMax
as primary key + hermes/langgraph kept as workflow_dispatch options
with OpenAI fallback. Per-runtime verify-key case-statement matches
canary-staging.yml + continuous-synth-e2e.yml byte-for-byte.

Two extra wrinkles vs canary:
- Dispatch input `runtime` default flipped from "hermes" to "claude-code"
  so operators dispatching from the UI get the safe path by default.
  They can still pick hermes/langgraph from the dropdown when they
  specifically want to exercise OpenAI.
- E2E_MODEL_SLUG is dispatch-aware: MiniMax-M2.7-highspeed for
  claude-code, openai/gpt-4o for hermes (slash-form per
  derive-provider.sh), openai:gpt-4o for langgraph (colon-form per
  init_chat_model). The branch comment in lib/model_slug.sh covers
  the rationale; pinning the slug here keeps the dispatch UX stable
  even when operators don't override.

After this lands + the canary commit lands, the only OpenAI-dependent
E2E surface is the operator-dispatch fallback. The cron canary, the
synth E2E, AND the full-lifecycle gate are all on MiniMax — separate
billing account, no OpenAI quota dependency on auto-runs.
---
 .github/workflows/e2e-staging-saas.yml | 57 +++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml
index 2a7efe16..2c252d10 100644
--- a/.github/workflows/e2e-staging-saas.yml
+++ b/.github/workflows/e2e-staging-saas.yml
@@ -48,9 +48,9 @@ on:
   workflow_dispatch:
     inputs:
       runtime:
-        description: "Runtime to test (hermes | claude-code | langgraph)"
+        description: "Runtime to test (claude-code [default, MiniMax] | hermes [OpenAI] | langgraph [OpenAI])"
         required: false
-        default: "hermes"
+        default: "claude-code"
       keep_org:
         description: "Skip teardown for debugging (only use via manual dispatch!)"
         required: false
@@ -83,11 +83,27 @@ jobs:
       # retrieval + teardown. Configure in
       # Settings → Secrets and variables → Actions → Repository secrets.
       MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
-      # OpenAI key for workspace LLM calls (section 8 A2A). Without it,
-      # Hermes runtime crashes at boot with "No provider API key found".
-      # Configure at Settings → Secrets → Actions → MOLECULE_STAGING_OPENAI_KEY.
+      # MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched
+      # from hermes+OpenAI default after #2578 (the staging OpenAI key
+      # account went over quota and stayed dead for 36+ hours, taking
+      # the full-lifecycle E2E red on every provisioning-critical push).
+      # claude-code template's `minimax` provider routes
+      # ANTHROPIC_BASE_URL to api.minimax.io/anthropic and reads
+      # MINIMAX_API_KEY at boot — separate billing account so an
+      # OpenAI quota collapse no longer wedges the gate. Mirrors the
+      # canary-staging.yml + continuous-synth-e2e.yml migrations.
+      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
+      # OpenAI fallback — kept wired so an operator-dispatched run with
+      # E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still
+      # exercise the OpenAI path.
       E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
-      E2E_RUNTIME: ${{ github.event.inputs.runtime || 'hermes' }}
+      E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
+      # Pin the model when running on the default claude-code path —
+      # the per-runtime default ("sonnet") routes to direct Anthropic
+      # and defeats the cost saving. Operators can override via the
+      # workflow_dispatch flow (no input wired here yet — runtime
+      # override is enough for ad-hoc).
+      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }}
       E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
       E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
 
@@ -102,13 +118,34 @@ jobs:
           fi
           echo "Admin token present ✓"
 
-      - name: Verify OpenAI key present
+      - name: Verify LLM key present
         run: |
-          if [ -z "$E2E_OPENAI_API_KEY" ]; then
-            echo "::error::MOLECULE_STAGING_OPENAI_KEY secret not set — workspaces will fail at boot with 'No provider API key found'"
+          # Per-runtime key check — claude-code uses MiniMax; hermes /
+          # langgraph (operator-dispatched only) use OpenAI. Hard-fail
+          # rather than soft-skip per #2578's lesson — empty key
+          # silently falls through to the wrong SECRETS_JSON branch and
+          # produces a confusing auth error 5 min later instead of the
+          # clean "secret missing" message at the top.
+          case "${E2E_RUNTIME}" in
+            claude-code)
+              required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
+              required_secret_value="${E2E_MINIMAX_API_KEY:-}"
+              ;;
+            langgraph|hermes)
+              required_secret_name="MOLECULE_STAGING_OPENAI_KEY"
+              required_secret_value="${E2E_OPENAI_API_KEY:-}"
+              ;;
+            *)
+              echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
+              required_secret_name=""
+              required_secret_value="present"
+              ;;
+          esac
+          if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
+            echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — workspaces will fail at boot with 'No provider API key found'"
             exit 2
           fi
-          echo "OpenAI key present ✓ (len=${#E2E_OPENAI_API_KEY})"
+          echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
 
       - name: CP staging health preflight
         run: |