From ac6f65ab5e3388df56db8e6fc80136f1e275faea Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Sun, 3 May 2026 12:04:12 -0700
Subject: [PATCH] test(e2e): pin pick_model_slug behavior with bash unit tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #2571 fixed synth-E2E by branching MODEL_SLUG per runtime, but only
the langgraph branch was verified at runtime — the hermes, claude-code,
override, and fallback paths had zero automated coverage. A future
regression (e.g. dropping the langgraph case) would silently revert the
fix and only surface as "Could not resolve authentication method" mid-E2E.

This PR:
- Extracts the dispatch into tests/e2e/lib/model_slug.sh as a sourceable
  pick_model_slug() function. No behavior change.
- Adds tests/e2e/test_model_slug.sh — 9 assertions: 5 for the dispatch
  (hermes, langgraph, claude-code, plus the unknown- and empty-runtime
  fallbacks) and 4 for the E2E_MODEL_SLUG override path. Verified to
  FAIL when a branch is flipped (manually regressed the langgraph slug
  to slash-form to confirm the test catches it; restored before commit).
- Wires the unit test into ci.yml's existing shellcheck job (only runs
  when tests/e2e/ or scripts/ change). Pure-bash, no live infra.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml            | 12 ++++
 tests/e2e/lib/model_slug.sh         | 51 ++++++++++++++++
 tests/e2e/test_model_slug.sh        | 90 +++++++++++++++++++++++++++++
 tests/e2e/test_staging_full_saas.sh | 43 +++-----------
 4 files changed, 160 insertions(+), 36 deletions(-)
 create mode 100755 tests/e2e/lib/model_slug.sh
 create mode 100755 tests/e2e/test_model_slug.sh
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2bca28a2..7f0c72bb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -272,6 +272,18 @@ jobs:
           find tests/e2e infra/scripts -type f -name '*.sh' -print0 \
             | xargs -0 shellcheck --severity=warning
 
+      - if: needs.changes.outputs.scripts == 'true'
+        name: Run E2E bash unit tests (no live infra)
+        # Pure-bash unit tests for E2E helper libs (lib/*.sh). These pin
+        # the dispatch logic that — when broken — surfaces only as a
+        # misleading "Could not resolve authentication method" error after
+        # a successful tenant + workspace provision (PR #2571 incident,
+        # 2026-05-03). Add new self-contained unit tests here as the lib/
+        # directory grows; tests requiring live CP/tenant credentials
+        # belong in the dedicated e2e-staging-* workflows, not this job.
+        run: |
+          bash tests/e2e/test_model_slug.sh
+
   canvas-deploy-reminder:
     name: Canvas Deploy Reminder
     runs-on: ubuntu-latest
diff --git a/tests/e2e/lib/model_slug.sh b/tests/e2e/lib/model_slug.sh
new file mode 100755
index 00000000..fd598a3a
--- /dev/null
+++ b/tests/e2e/lib/model_slug.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Per-runtime model slug dispatch for E2E provisioning.
+#
+# Different runtimes parse the model slug differently (PR #2571 incident,
+# 2026-05-03):
+#
+#   hermes      → "openai/gpt-4o"  (slash-form: derive-provider.sh splits
+#                                    on the prefix to set
+#                                    HERMES_INFERENCE_PROVIDER. Bare
+#                                    "gpt-4o" falls through to Anthropic
+#                                    default + 401, see PR #1714.)
+#
+#   langgraph   → "openai:gpt-4o"  (colon-form: langchain init_chat_model
+#                                    requires "<provider>:<model>".
+#                                    Slash-form was misinterpreted as
+#                                    OpenRouter routing → fell through
+#                                    without auth, surfaced 2026-05-03
+#                                    after the a2a-sdk v1 contract bugs
+#                                    PR #2558+#2563+#2567 cleared the
+#                                    masking layers.)
+#
+#   claude-code → "sonnet"         (entry-id form: claude-code template's
+#                                    config.yaml uses bare model names,
+#                                    auth comes via CLAUDE_CODE_OAUTH_TOKEN
+#                                    or ANTHROPIC_API_KEY rather than the
+#                                    slug.)
+#
+# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
+# operator dispatches the workflow to test a specific slug.
+#
+# Unit tested by tests/e2e/test_model_slug.sh — every branch must stay
+# pinned because a regression surfaces only as "Could not resolve
+# authentication method": the synth-E2E gate goes red without naming
+# the slug-format mismatch.
+
+# Usage: pick_model_slug <runtime>
+#   stdout: the slug string
+#   E2E_MODEL_SLUG (env): if set + non-empty, used as-is (operator override)
+pick_model_slug() {
+  local runtime="${1:-}"
+  if [ -n "${E2E_MODEL_SLUG:-}" ]; then
+    printf '%s' "$E2E_MODEL_SLUG"
+    return 0
+  fi
+  case "$runtime" in
+    hermes)      printf 'openai/gpt-4o' ;;
+    langgraph)   printf 'openai:gpt-4o' ;;
+    claude-code) printf 'sonnet' ;;
+    *)           printf 'openai/gpt-4o' ;;  # safest fallback (matches hermes)
+  esac
+}
diff --git a/tests/e2e/test_model_slug.sh b/tests/e2e/test_model_slug.sh
new file mode 100755
index 00000000..130b413a
--- /dev/null
+++ b/tests/e2e/test_model_slug.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# Regression test for tests/e2e/lib/model_slug.sh.
+#
+# PR #2571 fixed a synth-E2E masking bug where MODEL_SLUG was hardcoded
+# to "openai/gpt-4o" (slash-form) but langgraph's init_chat_model needs
+# "openai:gpt-4o" (colon-form). Fix shipped as a per-runtime case
+# statement. Without this regression test, dropping any branch of the
+# case (or flipping a slug format) would silently revert behavior — the
+# E2E only fails as "Could not resolve authentication method" at the
+# very first message, after a successful tenant + workspace provision.
+#
+# Each assertion pins the exact slug, so the test FAILS when a branch's
+# output changes; checking only for a non-empty string would not.
+set -uo pipefail
+
+# Resolve to the lib relative to this test file so the test runs from
+# any cwd (CI, local invocation, repo root).
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=lib/model_slug.sh
+source "$SCRIPT_DIR/lib/model_slug.sh"
+
+PASS=0
+FAIL=0
+
+assert_eq() {
+  local label="$1" got="$2" want="$3"
+  if [ "$got" = "$want" ]; then
+    echo "  ✓ $label"
+    PASS=$((PASS+1))
+  else
+    echo "  ✗ $label: got=$(printf %q "$got")  want=$(printf %q "$want")" >&2
+    FAIL=$((FAIL+1))
+  fi
+}
+
+run_test() {
+  local label="$1" runtime="$2" want="$3"
+  # Pin per-test isolation: explicitly unset the override so a leaked
+  # E2E_MODEL_SLUG from caller env can't poison the dispatch branches.
+  local got
+  got=$(unset E2E_MODEL_SLUG; pick_model_slug "$runtime")
+  assert_eq "$label" "$got" "$want"
+}
+
+echo "Test: pick_model_slug — per-runtime dispatch"
+echo
+
+# ── Per-runtime branches (the load-bearing ones for synth-E2E) ──
+run_test "hermes → slash-form (derive-provider.sh contract)"       hermes      "openai/gpt-4o"
+run_test "langgraph → colon-form (init_chat_model contract)"       langgraph   "openai:gpt-4o"
+run_test "claude-code → bare model name (entry-id form)"           claude-code "sonnet"
+
+# ── Fallback for unknown runtime ──
+# Picks slash-form (hermes-shaped) since hermes is the historical
+# default and most third-party runtimes behave hermes-like. Pinning
+# this so a future "smarter" fallback (e.g., empty string, error) is
+# a deliberate choice, not silent drift.
+run_test "unknown runtime → slash-form fallback"                   gemini      "openai/gpt-4o"
+run_test "empty runtime → slash-form fallback"                     ""          "openai/gpt-4o"
+
+# ── Override via E2E_MODEL_SLUG ──
+# When the operator sets E2E_MODEL_SLUG, the per-runtime dispatch is
+# bypassed. Used during workflow_dispatch to A/B specific slugs.
+echo
+echo "Test: pick_model_slug — E2E_MODEL_SLUG override"
+echo
+
+got=$(E2E_MODEL_SLUG="anthropic:claude-opus-4-7" pick_model_slug langgraph)
+assert_eq "override beats langgraph default"                      "$got" "anthropic:claude-opus-4-7"
+
+got=$(E2E_MODEL_SLUG="custom/whatever" pick_model_slug hermes)
+assert_eq "override beats hermes default"                         "$got" "custom/whatever"
+
+got=$(E2E_MODEL_SLUG="some-bare-id" pick_model_slug claude-code)
+assert_eq "override beats claude-code default"                    "$got" "some-bare-id"
+
+# Empty-string override does NOT activate (falls through to dispatch).
+# This is the historical bash idiom: -n "" → false → no override. Pin
+# it because changing this behavior (e.g. via -v test) would silently
+# break the dispatch when an operator passes "" to clear an inherited
+# env var.
+got=$(E2E_MODEL_SLUG="" pick_model_slug langgraph)
+assert_eq "empty-string override falls through to dispatch"       "$got" "openai:gpt-4o"
+
+echo
+echo "─────────────────────────────────────────────────"
+echo "PASSED: $PASS"
+echo "FAILED: $FAIL"
+echo "─────────────────────────────────────────────────"
+[ "$FAIL" -eq 0 ]
diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh
index da4e8a6a..ce7f1e29 100755
--- a/tests/e2e/test_staging_full_saas.sh
+++ b/tests/e2e/test_staging_full_saas.sh
@@ -67,6 +67,12 @@ log()  { echo "[$(date +%H:%M:%S)] $*"; }
 fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
 ok()   { echo "[$(date +%H:%M:%S)] ✅ $*"; }
 
+# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
+# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
+# without booting the full 11-step lifecycle.
+# shellcheck source=lib/model_slug.sh
+source "$(dirname "$0")/lib/model_slug.sh"
+
 CURL_COMMON=(-sS --fail-with-body --max-time 30)
 
 # ─── cleanup trap ───────────────────────────────────────────────────────
@@ -352,42 +358,7 @@ print(json.dumps({
 ")
 fi
 
-# Model slug format depends on the runtime — different model resolvers
-# parse it differently:
-#
-#   hermes      → "openai/gpt-4o"  (slash-form: derive-provider.sh splits
-#                                    on the prefix to set
-#                                    HERMES_INFERENCE_PROVIDER. Bare
-#                                    "gpt-4o" falls through to Anthropic
-#                                    default + 401, see PR #1714.)
-#
-#   langgraph   → "openai:gpt-4o"  (colon-form: langchain init_chat_model
-#                                    requires "<provider>:<model>".
-#                                    Slash-form was misinterpreted as
-#                                    OpenRouter routing → fell through
-#                                    without auth, surfaced 2026-05-03
-#                                    after the a2a-sdk v1 contract bugs
-#                                    PR #2558+#2563+#2567 cleared the
-#                                    masking layers.)
-#
-#   claude-code → "sonnet"         (entry-id form: claude-code template's
-#                                    config.yaml uses bare model names,
-#                                    auth comes via CLAUDE_CODE_OAUTH_TOKEN
-#                                    or ANTHROPIC_API_KEY rather than the
-#                                    slug.)
-#
-# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
-# operator dispatches the workflow to test a specific slug.
-if [ -n "${E2E_MODEL_SLUG:-}" ]; then
-  MODEL_SLUG="$E2E_MODEL_SLUG"
-else
-  case "$RUNTIME" in
-    hermes)      MODEL_SLUG="openai/gpt-4o" ;;
-    langgraph)   MODEL_SLUG="openai:gpt-4o" ;;
-    claude-code) MODEL_SLUG="sonnet" ;;
-    *)           MODEL_SLUG="openai/gpt-4o" ;;  # safest fallback (matches hermes)
-  esac
-fi
+MODEL_SLUG=$(pick_model_slug "$RUNTIME")
 
 log "5/11 Provisioning parent workspace (runtime=$RUNTIME)..."
 PARENT_RESP=$(tenant_call POST /workspaces \