From 79a0203798503cfe0bf433c7d6ef6277a3fadd5e Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 15:35:14 -0700 Subject: [PATCH] feat(synth-e2e): switch canary to claude-code + MiniMax-M2.7-highspeed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cuts the per-run LLM cost ~10x (MiniMax M2.7 vs gpt-4.1-mini) and removes the recurring OpenAI-quota-exhaustion failure mode that took the canary down on 2026-05-03 (#265 — staging quota burnt for ~16h). Path: E2E_RUNTIME=claude-code (default) → workspace-configs-templates/claude-code-default/config.yaml's `minimax` provider (lines 64-69) → ANTHROPIC_BASE_URL auto-set to api.minimax.io/anthropic → reads MINIMAX_API_KEY (per-vendor env, no collision with GLM/Z.ai etc.) Workflow changes (continuous-synth-e2e.yml): - Default runtime: langgraph → claude-code - New env: E2E_MODEL_SLUG (defaults to MiniMax-M2.7-highspeed, overridable via workflow_dispatch) - New secret wire: E2E_MINIMAX_API_KEY ← secrets.MOLECULE_STAGING_MINIMAX_API_KEY - Per-runtime missing-secret guard: claude-code requires MINIMAX, langgraph/hermes require OPENAI. 
Cron firing hard-fails on missing key for the active runtime; dispatch soft-skips so operators can ad-hoc test without setting up the secret first - Operators can still pick langgraph/hermes via workflow_dispatch; the OpenAI fallback path stays wired Script changes (tests/e2e/test_staging_full_saas.sh): - SECRETS_JSON branches on which key is set: E2E_MINIMAX_API_KEY → {MINIMAX_API_KEY: <key>} (claude-code path) E2E_OPENAI_API_KEY → {OPENAI_API_KEY, HERMES_*, MODEL_PROVIDER} (legacy) MiniMax wins when both are present — claude-code default canary must not accidentally consume the OpenAI key Tests (new tests/e2e/test_secrets_dispatch.sh): - 10 cases pinning the precedence + payload shape per branch - Discipline check verified: 5 of 10 FAIL on a swapped if/elif (precedence inversion), all 10 PASS on the fix - Anchors on the section-comment header so a structural refactor fails loudly rather than silently sourcing nothing The model_slug dispatcher (lib/model_slug.sh) needs no change: E2E_MODEL_SLUG override path is already wired (line 41), and claude-code template's `minimax-` prefix matcher catches "MiniMax-M2.7-highspeed" via lowercase-on-lookup. Operator action required to land green: - Set MOLECULE_STAGING_MINIMAX_API_KEY in repo secrets (Settings → Secrets and variables → Actions). Use `gh secret set MOLECULE_STAGING_MINIMAX_API_KEY -R Molecule-AI/molecule-core` to avoid leaking the value into shell history. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/continuous-synth-e2e.yml | 90 ++++++++++--- tests/e2e/test_secrets_dispatch.sh | 145 +++++++++++++++++++++ tests/e2e/test_staging_full_saas.sh | 54 ++++---- 3 files changed, 248 insertions(+), 41 deletions(-) create mode 100755 tests/e2e/test_secrets_dispatch.sh diff --git a/.github/workflows/continuous-synth-e2e.yml b/.github/workflows/continuous-synth-e2e.yml index c6c482b8..924684e8 100644 --- a/.github/workflows/continuous-synth-e2e.yml +++ b/.github/workflows/continuous-synth-e2e.yml @@ -39,9 +39,14 @@ on: workflow_dispatch: inputs: runtime: - description: "Runtime to provision (langgraph = fastest, default; hermes = slower but covers SDK-native path; claude-code = needs OAUTH token in tenant env)" + description: "Runtime to provision (claude-code = default + cheapest via MiniMax; langgraph = OpenAI-only; hermes = SDK-native path, slower)" required: false - default: "langgraph" + default: "claude-code" + type: string + model_slug: + description: "Model id to provision the workspace with (default MiniMax-M2.7-highspeed; e.g. 'sonnet' to test direct Anthropic, 'openai/gpt-4o' for hermes)" + required: false + default: "MiniMax-M2.7-highspeed" type: string keep_org: description: "Skip teardown for post-mortem debugging (only manual dispatch — never set this for cron runs)" @@ -70,13 +75,23 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 12 env: - # langgraph default keeps cold-start under 5 min on staging EC2. - # hermes is slower (~7-10 min) and isn't needed for the - # regression class this gate exists to catch (deployment-pipeline - # + schema-drift + integration). Operators can pick hermes via - # workflow_dispatch when they need to exercise the SDK-native - # session path. 
- E2E_RUNTIME: ${{ github.event.inputs.runtime || 'langgraph' }} + # claude-code default: cold-start ~5 min (comparable to langgraph), + # but uses MiniMax-M2.7-highspeed via the template's third-party- + # Anthropic-compat path (workspace-configs-templates/claude-code- + # default/config.yaml:64-69). MiniMax is ~5-10x cheaper than + # gpt-4.1-mini per token AND avoids the recurring OpenAI quota- + # exhaustion class that took the canary down 2026-05-03 (#265). + # Operators can pick langgraph / hermes via workflow_dispatch + # when they specifically need to exercise the OpenAI or SDK- + # native paths. + E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }} + # Pin the canary to a specific MiniMax model rather than relying + # on the per-runtime default ("sonnet" → routes to direct + # Anthropic, defeats the cost saving). Operators can override + # via workflow_dispatch by setting a different E2E_MODEL_SLUG + # input if they need to exercise a specific model. M2.7-highspeed + # is "Token Plan only" but cheap-per-token and fast. + E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7-highspeed' }} # Bound to 10 min so a stuck provision fails the run instead of # holding up the next cron firing. 15-min default in the script # is for the on-PR full lifecycle where we have more headroom. @@ -88,19 +103,26 @@ jobs: E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }} MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }} MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} - # Provisioned tenant's default model (langgraph: openai:gpt-4.1-mini) - # needs OPENAI_API_KEY at first call. Sibling workflows - # e2e-staging-saas.yml + canary-staging.yml use the same secret; - # without this wire-up the tenant boots, accepts a2a messages, - # then returns "Could not resolve authentication method" — masked - # earlier by the a2a-sdk task-mode contract bugs PR #2558+#2563 - # fixed. 
tests/e2e/test_staging_full_saas.sh:325 reads this and - # persists it as a workspace_secret on tenant create. + # MiniMax key is the canary's PRIMARY auth path. claude-code + # template's `minimax` provider routes ANTHROPIC_BASE_URL to + # api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot. + # tests/e2e/test_staging_full_saas.sh branches SECRETS_JSON on + # which key is present — MiniMax wins when set. + E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }} + # OpenAI fallback — kept wired so operators can dispatch with + # E2E_RUNTIME=langgraph or =hermes and still have a working + # canary path. The script picks the right blob shape based on + # which key is non-empty. E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Verify required secret present + - name: Verify required secrets present + env: + # Re-bind so the per-runtime LLM key check below sees the right + # secret. The job-level env block already reads both; this just + # makes them visible inside the conditional shell. + IS_DISPATCH: ${{ github.event_name == 'workflow_dispatch' }} run: | # Schedule-vs-dispatch hardening (mirrors the sweep-cf-* and # redeploy-tenants-on-* workflows): hard-fail on missing secret @@ -109,7 +131,7 @@ jobs: # dispatch — operators can dispatch ad-hoc to verify a fix # without setting up the secret first. if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + if [ "$IS_DISPATCH" = "true" ]; then echo "::warning::CP_STAGING_ADMIN_API_TOKEN not set — synth E2E cannot run" echo "::warning::Set it at Settings → Secrets and Variables → Actions" exit 0 @@ -119,6 +141,36 @@ jobs: exit 1 fi + # LLM-key requirement is per-runtime: claude-code uses MiniMax + # (MOLECULE_STAGING_MINIMAX_API_KEY), langgraph + hermes use + # OpenAI (MOLECULE_STAGING_OPENAI_KEY). 
Cron firing must have + # the right key for the active runtime; dispatch can soft-skip. + case "${E2E_RUNTIME}" in + claude-code) + required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY" + required_secret_value="${E2E_MINIMAX_API_KEY:-}" + ;; + langgraph|hermes) + required_secret_name="MOLECULE_STAGING_OPENAI_KEY" + required_secret_value="${E2E_OPENAI_API_KEY:-}" + ;; + *) + echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check" + required_secret_name="" + required_secret_value="present" + ;; + esac + if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then + if [ "$IS_DISPATCH" = "true" ]; then + echo "::warning::${required_secret_name} not set — synth E2E with runtime=${E2E_RUNTIME} cannot reach an LLM" + echo "::warning::Set it at Settings → Secrets and Variables → Actions, OR dispatch with a different runtime" + exit 0 + fi + echo "::error::${required_secret_name} secret missing — runtime=${E2E_RUNTIME} cannot authenticate against its LLM provider" + echo "::error::Set it at Settings → Secrets and Variables → Actions" + exit 1 + fi + - name: Install required tools run: | # The script depends on jq + curl (already on ubuntu-latest) diff --git a/tests/e2e/test_secrets_dispatch.sh b/tests/e2e/test_secrets_dispatch.sh new file mode 100755 index 00000000..36b4af2d --- /dev/null +++ b/tests/e2e/test_secrets_dispatch.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +# Regression test for the SECRETS_JSON branching in +# tests/e2e/test_staging_full_saas.sh (lines ~322-368). +# +# The synth-E2E canary picks one of two LLM auth paths based on which +# E2E_*_API_KEY is set. The branch order is load-bearing: +# +# E2E_MINIMAX_API_KEY first → claude-code MiniMax path (cheap canary +# default since 2026-05-03; routes via +# workspace-configs-templates/claude- +# code-default/config.yaml's `minimax` +# provider entry). 
+# +# E2E_OPENAI_API_KEY second → langgraph + hermes legacy path (kept +# as fallback for operator dispatches +# that need the OpenAI-shaped +# HERMES_CUSTOM_* env block). +# +# Without this gate, a future "tidy up the if/elif" refactor could +# silently flip the precedence (OpenAI wins when both are set → +# claude-code workspace boots without MINIMAX_API_KEY → 401 at first +# turn → canary red without any signal that the wrong key shape was +# selected). The 2026-05-03 OpenAI-quota incident took ~16h to +# diagnose for exactly this class of "looks like an LLM problem, +# was actually a wiring problem" failure. + +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SAAS_SCRIPT="$SCRIPT_DIR/test_staging_full_saas.sh" + +if [ ! -f "$SAAS_SCRIPT" ]; then + echo "FATAL: cannot locate test_staging_full_saas.sh at $SAAS_SCRIPT" >&2 + exit 2 +fi + +PASS=0 +FAIL=0 + +assert_eq() { + local label="$1" got="$2" want="$3" + if [ "$got" = "$want" ]; then + echo " ✓ $label" + PASS=$((PASS+1)) + else + echo " ✗ $label" >&2 + echo " got: $got" >&2 + echo " want: $want" >&2 + FAIL=$((FAIL+1)) + fi +} + +# Extract just the SECRETS_JSON block from the saas script and source +# it into a sub-shell so we can run the branching logic in isolation. +# Anchor on the comment header so a structural refactor that moves the +# block fails this test loudly rather than silently sourcing nothing. +extract_block() { + awk ' + /^# ─── 5\. Provision parent workspace/ {capture=1; next} + capture && /^MODEL_SLUG=/ {exit} + capture {print} + ' "$SAAS_SCRIPT" +} + +BLOCK=$(extract_block) +if [ -z "$BLOCK" ]; then + echo "FATAL: SECRETS_JSON block not found in $SAAS_SCRIPT — refactor anchor changed?" >&2 + exit 2 +fi + +# Run the extracted block in a clean env, capturing SECRETS_JSON. +run_block() { + # Caller passes vars on the command line, e.g. 
+ # run_block E2E_MINIMAX_API_KEY=mx-test + env -i PATH="$PATH" "$@" bash -c " + set -uo pipefail + $BLOCK + echo \"\$SECRETS_JSON\" + " 2>/dev/null | tail -1 +} + +# Resolve a JSON key from the captured payload using python3 (already +# a hard dep of the saas script). Returns empty string on missing key. +get_json_key() { + local payload="$1" key="$2" + python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +print(p.get(sys.argv[2], '')) +" "$payload" "$key" +} + +list_json_keys() { + python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +print(','.join(sorted(p.keys()))) +" "$1" +} + +echo "Test: SECRETS_JSON branching in test_staging_full_saas.sh" +echo + +# ── Branch 1: MiniMax wins when set ── +SECRETS_JSON=$(run_block E2E_MINIMAX_API_KEY=mx-test) +assert_eq "MiniMax key set → MINIMAX_API_KEY in payload" \ + "$(get_json_key "$SECRETS_JSON" MINIMAX_API_KEY)" "mx-test" +assert_eq "MiniMax-only payload contains exactly MINIMAX_API_KEY" \ + "$(list_json_keys "$SECRETS_JSON")" "MINIMAX_API_KEY" + +# ── Branch 1 precedence: MiniMax beats OpenAI when both set ── +# Critical: the 2026-05-03 incident shape was "two paths exist, wrong +# one wins". The bash if/elif must keep MiniMax above OpenAI so the +# claude-code default canary doesn't accidentally use the (more +# expensive, quota-burnt) OpenAI key. 
+SECRETS_JSON=$(run_block E2E_MINIMAX_API_KEY=mx-priority E2E_OPENAI_API_KEY=oai-loser) +assert_eq "Both keys set → MiniMax wins" \ + "$(get_json_key "$SECRETS_JSON" MINIMAX_API_KEY)" "mx-priority" +assert_eq "Both keys set → OpenAI block NOT emitted" \ + "$(get_json_key "$SECRETS_JSON" OPENAI_API_KEY)" "" +assert_eq "Both keys set → no HERMES_* leakage from OpenAI branch" \ + "$(get_json_key "$SECRETS_JSON" HERMES_INFERENCE_PROVIDER)" "" + +# ── Branch 2: OpenAI used when MiniMax absent ── +SECRETS_JSON=$(run_block E2E_OPENAI_API_KEY=oai-test) +assert_eq "Only OpenAI set → OPENAI_API_KEY in payload" \ + "$(get_json_key "$SECRETS_JSON" OPENAI_API_KEY)" "oai-test" +assert_eq "Only OpenAI set → HERMES_CUSTOM_API_KEY mirrors OpenAI key" \ + "$(get_json_key "$SECRETS_JSON" HERMES_CUSTOM_API_KEY)" "oai-test" +assert_eq "Only OpenAI set → MODEL_PROVIDER pinned to colon-form" \ + "$(get_json_key "$SECRETS_JSON" MODEL_PROVIDER)" "openai:gpt-4o" +assert_eq "Only OpenAI set → MINIMAX_API_KEY NOT emitted" \ + "$(get_json_key "$SECRETS_JSON" MINIMAX_API_KEY)" "" + +# ── No keys: empty payload ── +SECRETS_JSON=$(run_block) +assert_eq "No keys set → SECRETS_JSON is empty object" \ + "$SECRETS_JSON" "{}" + +echo +echo "─────────────────────────────────────────────────" +echo "PASSED: $PASS" +echo "FAILED: $FAIL" +echo "─────────────────────────────────────────────────" +[ "$FAIL" -eq 0 ] diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 759af7b9..5754b04d 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -320,29 +320,39 @@ tenant_call() { } # ─── 5. Provision parent workspace ───────────────────────────────────── -# Runtimes like hermes crash at boot with "No provider API key found" -# if nothing in the standard env-var list is set. 
Inject the API key -# from E2E_OPENAI_API_KEY so the runtime can actually start — it's -# per-workspace secret, so it's persisted as a workspace_secret and -# materialized into the container env. Missing key falls through to -# an empty secrets map; workspace will still fail but the error is -# expected and actionable. +# Inject the LLM provider key so the runtime can authenticate at boot. +# Branch by which secret is set so the script supports both paths +# without forcing every dispatch to ship both keys: +# +# E2E_MINIMAX_API_KEY → claude-code MiniMax path. Cheapest, default +# for the cron canary post-2026-05-03. Routes via the claude-code +# template's `minimax` provider (workspace-configs-templates/ +# claude-code-default/config.yaml:64-69) which sets +# ANTHROPIC_BASE_URL=https://api.minimax.io/anthropic at boot. +# MINIMAX_API_KEY is the vendor-specific env name the adapter +# reads (PR #244 — per-vendor envs prevent ANTHROPIC_AUTH_TOKEN +# collisions when a user runs MiniMax + Z.ai workspaces side-by- +# side). +# +# E2E_OPENAI_API_KEY → langgraph + hermes paths. Kept as fallback +# for operator dispatches that explicitly want to exercise the +# OpenAI path. The HERMES_* fields pin hermes-agent's bridge to +# api.openai.com (template-hermes' derive-provider.sh otherwise +# resolves openai/* → openrouter.ai and 401s). MODEL_PROVIDER +# follows workspace/config.py:258's 'provider:model' format. +# +# Both empty → '{}' (workspace will fail at first turn with an +# expected, actionable auth error rather than masking the test). SECRETS_JSON='{}' -if [ -n "${E2E_OPENAI_API_KEY:-}" ]; then - # MODEL_PROVIDER is a full model slug in 'provider:model' format per - # workspace/config.py:258. Using just "openai" gets parsed as the - # model name → 404 model_not_found. Also set OPENAI_BASE_URL to - # OpenAI's own endpoint — default is openrouter.ai which would need - # a different key format. 
- # - # The HERMES_* fields below bypass template-hermes/scripts/derive-provider.sh - # — verified 2026-04-24 that even with template-hermes#19's fix in main, - # staging tenants sometimes resolve openai/* to PROVIDER=openrouter and - # emit {'message':'Missing Authentication header','code':401} (OpenRouter's - # shape) in the A2A reply. Setting HERMES_INFERENCE_PROVIDER=custom + - # HERMES_CUSTOM_{BASE_URL,API_KEY,API_MODE} pins the bridge deterministically - # so the test doesn't depend on every tenant EC2 having a freshly-cloned - # template-hermes. +if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then + SECRETS_JSON=$(python3 -c " +import json, os +k = os.environ['E2E_MINIMAX_API_KEY'] +print(json.dumps({ + 'MINIMAX_API_KEY': k, +})) +") +elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then SECRETS_JSON=$(python3 -c " import json, os k = os.environ['E2E_OPENAI_API_KEY']