forked from molecule-ai/molecule-core
Merge pull request #2619 from Molecule-AI/test/synth-e2e-model-slug-coverage
test(e2e): pin pick_model_slug behavior with bash unit tests
This commit is contained in:
commit
8d5e78d629
12
.github/workflows/ci.yml
vendored
12
.github/workflows/ci.yml
vendored
@ -272,6 +272,18 @@ jobs:
|
||||
find tests/e2e infra/scripts -type f -name '*.sh' -print0 \
|
||||
| xargs -0 shellcheck --severity=warning
|
||||
|
||||
- if: needs.changes.outputs.scripts == 'true'
|
||||
name: Run E2E bash unit tests (no live infra)
|
||||
# Pure-bash unit tests for E2E helper libs (lib/*.sh). These pin
|
||||
# behavior of dispatch logic that — when broken — silently masks as
|
||||
# "Could not resolve authentication method" only after a successful
|
||||
# tenant + workspace provision (PR #2571 incident, 2026-05-03). Add
|
||||
# new self-contained unit tests here as the lib/ directory grows;
|
||||
# tests requiring live CP/tenant credentials belong in the dedicated
|
||||
# e2e-staging-* workflows, not this job.
|
||||
run: |
|
||||
bash tests/e2e/test_model_slug.sh
|
||||
|
||||
canvas-deploy-reminder:
|
||||
name: Canvas Deploy Reminder
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
51
tests/e2e/lib/model_slug.sh
Executable file
51
tests/e2e/lib/model_slug.sh
Executable file
@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env bash
|
||||
# Per-runtime model slug dispatch for E2E provisioning.
|
||||
#
|
||||
# Different runtimes parse the model slug differently (PR #2571 incident,
|
||||
# 2026-05-03):
|
||||
#
|
||||
# hermes → "openai/gpt-4o" (slash-form: derive-provider.sh splits
|
||||
# on the prefix to set
|
||||
# HERMES_INFERENCE_PROVIDER. Bare
|
||||
# "gpt-4o" falls through to Anthropic
|
||||
# default + 401, see PR #1714.)
|
||||
#
|
||||
# langgraph → "openai:gpt-4o" (colon-form: langchain init_chat_model
|
||||
# requires "<provider>:<model>".
|
||||
# Slash-form was misinterpreted as
|
||||
# OpenRouter routing → fell through
|
||||
# without auth, surfaced 2026-05-03
|
||||
# after the a2a-sdk v1 contract bugs
|
||||
# PR #2558+#2563+#2567 cleared the
|
||||
# masking layers.)
|
||||
#
|
||||
# claude-code → "sonnet" (entry-id form: claude-code template's
|
||||
# config.yaml uses bare model names,
|
||||
# auth comes via CLAUDE_CODE_OAUTH_TOKEN
|
||||
# or ANTHROPIC_API_KEY rather than the
|
||||
# slug.)
|
||||
#
|
||||
# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
|
||||
# operator dispatches the workflow to test a specific slug.
|
||||
#
|
||||
# Unit tested by tests/e2e/test_model_slug.sh — every branch must stay
|
||||
# pinned because regressions silently mask as "Could not resolve
|
||||
# authentication method" + the synth-E2E gate goes red without naming
|
||||
# the slug-format mismatch.
|
||||
|
||||
# Usage: pick_model_slug <runtime>
#   stdout: the slug string (written without a trailing newline)
#   E2E_MODEL_SLUG (env): if set + non-empty, used as-is (operator override)
#
# Arguments:
#   $1 - runtime name. "hermes", "langgraph" and "claude-code" have
#        dedicated slugs; any other value (including a missing or empty
#        argument) gets the hermes-shaped slash-form fallback.
pick_model_slug() {
  local runtime="${1:-}"

  # Operator override beats the dispatch table. The check is -n (non-empty),
  # so E2E_MODEL_SLUG="" falls through to the per-runtime defaults below.
  if [ -n "${E2E_MODEL_SLUG:-}" ]; then
    # '%s' keeps the override out of the format string, so a slug that
    # contains '%' or '\' is emitted verbatim.
    printf '%s' "$E2E_MODEL_SLUG"
    return 0
  fi

  case "$runtime" in
    hermes)      printf 'openai/gpt-4o' ;;
    langgraph)   printf 'openai:gpt-4o' ;;
    claude-code) printf 'sonnet' ;;
    *)           printf 'openai/gpt-4o' ;;  # safest fallback (matches hermes)
  esac
}
|
||||
90
tests/e2e/test_model_slug.sh
Executable file
90
tests/e2e/test_model_slug.sh
Executable file
@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
|
||||
# Regression test for tests/e2e/lib/model_slug.sh.
|
||||
#
|
||||
# PR #2571 fixed a synth-E2E masking bug where MODEL_SLUG was hardcoded
|
||||
# to "openai/gpt-4o" (slash-form) but langgraph's init_chat_model needs
|
||||
# "openai:gpt-4o" (colon-form). Fix shipped as a per-runtime case
|
||||
# statement. Without this regression test, dropping any branch of the
|
||||
# case (or flipping a slug format) would silently revert behavior — the
|
||||
# E2E only fails as "Could not resolve authentication method" at the
|
||||
# very first message, after a successful tenant + workspace provision.
|
||||
#
|
||||
# Each branch must FAIL the test if the dispatch behavior changes, not
|
||||
# just produce some non-empty string.
|
||||
set -uo pipefail
|
||||
|
||||
# Resolve to the lib relative to this test file so the test runs from
|
||||
# any cwd (CI, local invocation, repo root).
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
# shellcheck source=lib/model_slug.sh
|
||||
source "$SCRIPT_DIR/lib/model_slug.sh"
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
# Usage: assert_eq <label> <got> <want>
#   Compares <got> and <want> as strings and records the outcome.
#   Globals: PASS, FAIL (written) - the matching counter is incremented;
#            both must already hold integers when this is called.
#   Outputs: a pass line on stdout, or a fail line on stderr with both
#            values %q-quoted so whitespace and empty strings are visible.
assert_eq() {
  local label="$1" got="$2" want="$3"
  if [ "$got" = "$want" ]; then
    printf ' ✓ %s\n' "$label"
    PASS=$((PASS + 1))
  else
    printf ' ✗ %s: got=%q want=%q\n' "$label" "$got" "$want" >&2
    FAIL=$((FAIL + 1))
  fi
}
|
||||
|
||||
# Usage: run_test <label> <runtime> <want>
#   Calls pick_model_slug for <runtime> with the operator override removed
#   and passes the captured slug to assert_eq together with <want>.
run_test() {
  local label="$1" runtime="$2" want="$3"
  local got

  # Pin per-test isolation: explicitly unset the override so a leaked
  # E2E_MODEL_SLUG from caller env can't poison the dispatch branches.
  # The unset runs inside the command substitution, so the caller's own
  # value of E2E_MODEL_SLUG is left as it was.
  got=$(unset E2E_MODEL_SLUG; pick_model_slug "$runtime")

  assert_eq "$label" "$got" "$want"
}
|
||||
|
||||
echo "Test: pick_model_slug — per-runtime dispatch"
|
||||
echo
|
||||
|
||||
# ── Per-runtime branches (the load-bearing ones for synth-E2E) ──
|
||||
run_test "hermes → slash-form (derive-provider.sh contract)" hermes "openai/gpt-4o"
|
||||
run_test "langgraph → colon-form (init_chat_model contract)" langgraph "openai:gpt-4o"
|
||||
run_test "claude-code → bare model name (entry-id form)" claude-code "sonnet"
|
||||
|
||||
# ── Fallback for unknown runtime ──
|
||||
# Picks slash-form (hermes-shaped) since hermes is the historical
|
||||
# default and most third-party runtimes behave hermes-like. Pinning
|
||||
# this so a future "smarter" fallback (e.g., empty string, error) is
|
||||
# a deliberate choice, not silent drift.
|
||||
run_test "unknown runtime → slash-form fallback" gemini "openai/gpt-4o"
|
||||
run_test "empty runtime → slash-form fallback" "" "openai/gpt-4o"
|
||||
|
||||
# ── Override via E2E_MODEL_SLUG ──
|
||||
# When the operator sets E2E_MODEL_SLUG, the per-runtime dispatch is
|
||||
# bypassed. Used during workflow_dispatch to A/B specific slugs.
|
||||
echo
|
||||
echo "Test: pick_model_slug — E2E_MODEL_SLUG override"
|
||||
echo
|
||||
|
||||
got=$(E2E_MODEL_SLUG="anthropic:claude-opus-4-7" pick_model_slug langgraph)
|
||||
assert_eq "override beats langgraph default" "$got" "anthropic:claude-opus-4-7"
|
||||
|
||||
got=$(E2E_MODEL_SLUG="custom/whatever" pick_model_slug hermes)
|
||||
assert_eq "override beats hermes default" "$got" "custom/whatever"
|
||||
|
||||
got=$(E2E_MODEL_SLUG="some-bare-id" pick_model_slug claude-code)
|
||||
assert_eq "override beats claude-code default" "$got" "some-bare-id"
|
||||
|
||||
# Empty-string override does NOT activate (falls through to dispatch).
|
||||
# This is the historical bash idiom: -n "" → false → no override. Pin
|
||||
# it because changing this behavior (e.g. via -v test) would silently
|
||||
# break the dispatch when an operator passes "" to clear an inherited
|
||||
# env var.
|
||||
got=$(E2E_MODEL_SLUG="" pick_model_slug langgraph)
|
||||
assert_eq "empty-string override falls through to dispatch" "$got" "openai:gpt-4o"
|
||||
|
||||
echo
|
||||
echo "─────────────────────────────────────────────────"
|
||||
echo "PASSED: $PASS"
|
||||
echo "FAILED: $FAIL"
|
||||
echo "─────────────────────────────────────────────────"
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@ -67,6 +67,12 @@ log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
|
||||
ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
|
||||
|
||||
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
|
||||
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
|
||||
# without booting the full 11-step lifecycle.
|
||||
# shellcheck source=lib/model_slug.sh
|
||||
source "$(dirname "$0")/lib/model_slug.sh"
|
||||
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
|
||||
# ─── cleanup trap ───────────────────────────────────────────────────────
|
||||
@ -352,42 +358,7 @@ print(json.dumps({
|
||||
")
|
||||
fi
|
||||
|
||||
# Model slug format depends on the runtime — different model resolvers
|
||||
# parse it differently:
|
||||
#
|
||||
# hermes → "openai/gpt-4o" (slash-form: derive-provider.sh splits
|
||||
# on the prefix to set
|
||||
# HERMES_INFERENCE_PROVIDER. Bare
|
||||
# "gpt-4o" falls through to Anthropic
|
||||
# default + 401, see PR #1714.)
|
||||
#
|
||||
# langgraph → "openai:gpt-4o" (colon-form: langchain init_chat_model
|
||||
# requires "<provider>:<model>".
|
||||
# Slash-form was misinterpreted as
|
||||
# OpenRouter routing → fell through
|
||||
# without auth, surfaced 2026-05-03
|
||||
# after the a2a-sdk v1 contract bugs
|
||||
# PR #2558+#2563+#2567 cleared the
|
||||
# masking layers.)
|
||||
#
|
||||
# claude-code → "sonnet" (entry-id form: claude-code template's
|
||||
# config.yaml uses bare model names,
|
||||
# auth comes via CLAUDE_CODE_OAUTH_TOKEN
|
||||
# or ANTHROPIC_API_KEY rather than the
|
||||
# slug.)
|
||||
#
|
||||
# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
|
||||
# operator dispatches the workflow to test a specific slug.
|
||||
if [ -n "${E2E_MODEL_SLUG:-}" ]; then
|
||||
MODEL_SLUG="$E2E_MODEL_SLUG"
|
||||
else
|
||||
case "$RUNTIME" in
|
||||
hermes) MODEL_SLUG="openai/gpt-4o" ;;
|
||||
langgraph) MODEL_SLUG="openai:gpt-4o" ;;
|
||||
claude-code) MODEL_SLUG="sonnet" ;;
|
||||
*) MODEL_SLUG="openai/gpt-4o" ;; # safest fallback (matches hermes)
|
||||
esac
|
||||
fi
|
||||
MODEL_SLUG=$(pick_model_slug "$RUNTIME")
|
||||
|
||||
log "5/11 Provisioning parent workspace (runtime=$RUNTIME)..."
|
||||
PARENT_RESP=$(tenant_call POST /workspaces \
|
||||
|
||||
Loading…
Reference in New Issue
Block a user