From ac6f65ab5e3388df56db8e6fc80136f1e275faea Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 12:04:12 -0700 Subject: [PATCH] test(e2e): pin pick_model_slug behavior with bash unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #2571 fixed synth-E2E by branching MODEL_SLUG per runtime, but only the langgraph branch was verified at runtime — hermes / claude-code / override / fallback had zero automated coverage. A future regression (e.g. dropping the langgraph case) would silently revert and only surface as "Could not resolve authentication method" mid-E2E. This PR: - Extracts the dispatch into tests/e2e/lib/model_slug.sh as a sourceable pick_model_slug() function. No behavior change. - Adds tests/e2e/test_model_slug.sh — 9 assertions across all 5 dispatch branches plus the override path. Verified to FAIL when any branch is flipped (manually regressed langgraph slash-form to confirm the test catches it; restored before commit). - Wires the unit test into ci.yml's existing shellcheck job (only runs when tests/e2e/ or scripts/ change). Pure-bash, no live infra. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 12 ++++ tests/e2e/lib/model_slug.sh | 51 ++++++++++++++++ tests/e2e/test_model_slug.sh | 90 +++++++++++++++++++++++++++++ tests/e2e/test_staging_full_saas.sh | 43 +++----------- 4 files changed, 160 insertions(+), 36 deletions(-) create mode 100755 tests/e2e/lib/model_slug.sh create mode 100755 tests/e2e/test_model_slug.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2bca28a2..7f0c72bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -272,6 +272,18 @@ jobs: find tests/e2e infra/scripts -type f -name '*.sh' -print0 \ | xargs -0 shellcheck --severity=warning + - if: needs.changes.outputs.scripts == 'true' + name: Run E2E bash unit tests (no live infra) + # Pure-bash unit tests for E2E helper libs (lib/*.sh). 
These pin + # behavior of dispatch logic that — when broken — silently masks as + # "Could not resolve authentication method" only after a successful + # tenant + workspace provision (PR #2571 incident, 2026-05-03). Add + # new self-contained unit tests here as the lib/ directory grows; + # tests requiring live CP/tenant credentials belong in the dedicated + # e2e-staging-* workflows, not this job. + run: | + bash tests/e2e/test_model_slug.sh + canvas-deploy-reminder: name: Canvas Deploy Reminder runs-on: ubuntu-latest diff --git a/tests/e2e/lib/model_slug.sh b/tests/e2e/lib/model_slug.sh new file mode 100755 index 00000000..fd598a3a --- /dev/null +++ b/tests/e2e/lib/model_slug.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Per-runtime model slug dispatch for E2E provisioning. +# +# Different runtimes parse the model slug differently (PR #2571 incident, +# 2026-05-03): +# +# hermes → "openai/gpt-4o" (slash-form: derive-provider.sh splits +# on the prefix to set +# HERMES_INFERENCE_PROVIDER. Bare +# "gpt-4o" falls through to Anthropic +# default + 401, see PR #1714.) +# +# langgraph → "openai:gpt-4o" (colon-form: langchain init_chat_model +# requires "<provider>:<model>". +# Slash-form was misinterpreted as +# OpenRouter routing → fell through +# without auth, surfaced 2026-05-03 +# after the a2a-sdk v1 contract bugs +# PR #2558+#2563+#2567 cleared the +# masking layers.) +# +# claude-code → "sonnet" (entry-id form: claude-code template's +# config.yaml uses bare model names, +# auth comes via CLAUDE_CODE_OAUTH_TOKEN +# or ANTHROPIC_API_KEY rather than the +# slug.) +# +# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an +# operator dispatches the workflow to test a specific slug. +# +# Unit tested by tests/e2e/test_model_slug.sh — every branch must stay +# pinned because regressions silently mask as "Could not resolve +# authentication method" + the synth-E2E gate goes red without naming +# the slug-format mismatch. 
+
+# Usage: pick_model_slug <runtime>
+#   stdout: the slug string (printed without a trailing newline)
+#   E2E_MODEL_SLUG (env): if set + non-empty, used as-is (operator override)
+pick_model_slug() {
+  local runtime="${1:-}"  # a missing argument becomes "" and takes the *) arm below
+  if [ -n "${E2E_MODEL_SLUG:-}" ]; then
+    printf '%s' "$E2E_MODEL_SLUG"
+    return 0
+  fi
+  case "$runtime" in
+    hermes)      printf 'openai/gpt-4o' ;;
+    langgraph)   printf 'openai:gpt-4o' ;;
+    claude-code) printf 'sonnet' ;;
+    *)           printf 'openai/gpt-4o' ;;  # safest fallback (matches hermes)
+  esac
+}
diff --git a/tests/e2e/test_model_slug.sh b/tests/e2e/test_model_slug.sh
new file mode 100755
index 00000000..130b413a
--- /dev/null
+++ b/tests/e2e/test_model_slug.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# Regression test for tests/e2e/lib/model_slug.sh.
+#
+# PR #2571 fixed a synth-E2E masking bug where MODEL_SLUG was hardcoded
+# to "openai/gpt-4o" (slash-form) but langgraph's init_chat_model needs
+# "openai:gpt-4o" (colon-form). Fix shipped as a per-runtime case
+# statement. Without this regression test, dropping any branch of the
+# case (or flipping a slug format) would silently revert behavior — the
+# E2E only fails as "Could not resolve authentication method" at the
+# very first message, after a successful tenant + workspace provision.
+#
+# Each branch must FAIL the test if the dispatch behavior changes, not
+# just produce some non-empty string.
+set -uo pipefail
+
+# Resolve to the lib relative to this test file so the test runs from
+# any cwd (CI, local invocation, repo root).
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=lib/model_slug.sh
+source "$SCRIPT_DIR/lib/model_slug.sh"
+
+PASS=0  # count of assertions that matched
+FAIL=0  # count of assertions that did not match
+
+assert_eq() {
+  local label="$1" got="$2" want="$3"  # test name, actual value, expected value
+  if [ "$got" = "$want" ]; then
+    echo " ✓ $label"
+    PASS=$((PASS+1))
+  else  # mismatch goes to stderr; %q shell-quotes both values so empty strings stay visible
+    echo " ✗ $label: got=$(printf %q "$got") want=$(printf %q "$want")" >&2
+    FAIL=$((FAIL+1))
+  fi
+}
+
+run_test() {
+  local label="$1" runtime="$2" want="$3"  # test name, runtime handed to pick_model_slug, expected slug
+  # Pin per-test isolation: explicitly unset the override so a leaked
+  # E2E_MODEL_SLUG from caller env can't poison the dispatch branches.
+  local got
+  got=$(unset E2E_MODEL_SLUG; pick_model_slug "$runtime")  # the unset only affects this $(...) subshell
+  assert_eq "$label" "$got" "$want"
+}
+
+echo "Test: pick_model_slug — per-runtime dispatch"
+echo
+
+# ── Per-runtime branches (the load-bearing ones for synth-E2E) ──
+run_test "hermes → slash-form (derive-provider.sh contract)" hermes "openai/gpt-4o"
+run_test "langgraph → colon-form (init_chat_model contract)" langgraph "openai:gpt-4o"
+run_test "claude-code → bare model name (entry-id form)" claude-code "sonnet"
+
+# ── Fallback for unknown runtime ──
+# Picks slash-form (hermes-shaped) since hermes is the historical
+# default and most third-party runtimes behave hermes-like. Pinning
+# this so a future "smarter" fallback (e.g., empty string, error) is
+# a deliberate choice, not silent drift.
+run_test "unknown runtime → slash-form fallback" gemini "openai/gpt-4o"
+run_test "empty runtime → slash-form fallback" "" "openai/gpt-4o"
+
+# ── Override via E2E_MODEL_SLUG ──
+# When the operator sets E2E_MODEL_SLUG, the per-runtime dispatch is
+# bypassed. Used during workflow_dispatch to A/B specific slugs.
+echo
+echo "Test: pick_model_slug — E2E_MODEL_SLUG override"
+echo
+
+got=$(E2E_MODEL_SLUG="anthropic:claude-opus-4-7" pick_model_slug langgraph)
+assert_eq "override beats langgraph default" "$got" "anthropic:claude-opus-4-7"
+
+got=$(E2E_MODEL_SLUG="custom/whatever" pick_model_slug hermes)
+assert_eq "override beats hermes default" "$got" "custom/whatever"
+
+got=$(E2E_MODEL_SLUG="some-bare-id" pick_model_slug claude-code)
+assert_eq "override beats claude-code default" "$got" "some-bare-id"
+
+# Empty-string override does NOT activate (falls through to dispatch).
+# This is the historical bash idiom: -n "" → false → no override. Pin
+# it because changing this behavior (e.g. via -v test) would silently
+# break the dispatch when an operator passes "" to clear an inherited
+# env var.
+got=$(E2E_MODEL_SLUG="" pick_model_slug langgraph)
+assert_eq "empty-string override falls through to dispatch" "$got" "openai:gpt-4o"
+
+echo
+echo "─────────────────────────────────────────────────"
+echo "PASSED: $PASS"
+echo "FAILED: $FAIL"
+echo "─────────────────────────────────────────────────"
+[ "$FAIL" -eq 0 ]  # last command: its status becomes the script's exit status
diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh
index da4e8a6a..ce7f1e29 100755
--- a/tests/e2e/test_staging_full_saas.sh
+++ b/tests/e2e/test_staging_full_saas.sh
@@ -67,6 +67,12 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
 fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
 ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
 
+# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
+# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
+# without booting the full 11-step lifecycle.
+# shellcheck source=lib/model_slug.sh
+source "$(dirname "$0")/lib/model_slug.sh"
+
 CURL_COMMON=(-sS --fail-with-body --max-time 30)
 
 # ─── cleanup trap ───────────────────────────────────────────────────────
@@ -352,42 +358,7 @@ print(json.dumps({
 ")
 fi
 
-# Model slug format depends on the runtime — different model resolvers
-# parse it differently:
-#
-#   hermes      → "openai/gpt-4o"   (slash-form: derive-provider.sh splits
-#                                    on the prefix to set
-#                                    HERMES_INFERENCE_PROVIDER. Bare
-#                                    "gpt-4o" falls through to Anthropic
-#                                    default + 401, see PR #1714.)
-#
-#   langgraph   → "openai:gpt-4o"   (colon-form: langchain init_chat_model
-#                                    requires "<provider>:<model>".
-#                                    Slash-form was misinterpreted as
-#                                    OpenRouter routing → fell through
-#                                    without auth, surfaced 2026-05-03
-#                                    after the a2a-sdk v1 contract bugs
-#                                    PR #2558+#2563+#2567 cleared the
-#                                    masking layers.)
-#
-#   claude-code → "sonnet"          (entry-id form: claude-code template's
-#                                    config.yaml uses bare model names,
-#                                    auth comes via CLAUDE_CODE_OAUTH_TOKEN
-#                                    or ANTHROPIC_API_KEY rather than the
-#                                    slug.)
-#
-# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
-# operator dispatches the workflow to test a specific slug.
-if [ -n "${E2E_MODEL_SLUG:-}" ]; then
-  MODEL_SLUG="$E2E_MODEL_SLUG"
-else
-  case "$RUNTIME" in
-    hermes)      MODEL_SLUG="openai/gpt-4o" ;;
-    langgraph)   MODEL_SLUG="openai:gpt-4o" ;;
-    claude-code) MODEL_SLUG="sonnet" ;;
-    *)           MODEL_SLUG="openai/gpt-4o" ;;  # safest fallback (matches hermes)
-  esac
-fi
+MODEL_SLUG=$(pick_model_slug "$RUNTIME")
 
 log "5/11 Provisioning parent workspace (runtime=$RUNTIME)..."
 PARENT_RESP=$(tenant_call POST /workspaces \