From a4b3ebf951c3ffb44ed8d3dfe046c79a9ef1fc10 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 23:48:54 -0700 Subject: [PATCH] test(e2e): claude-code + hermes priority-runtimes happy path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-contained happy-path E2E for the two runtimes the project commits to first-class support for (task #116, completes the loop on the "both must work end-to-end with tests" requirement). What it proves per runtime: 1. POST /workspaces succeeds with the runtime + secrets 2. Workspace reaches status=online within its cold-boot window (claude-code: 240s, hermes: 900s on cold apt + uv + sidecar) 3. POST /a2a (message/send "Reply with PONG") returns a non-error, non-empty reply 4. activity_logs row written with method=message/send and ok|error status (a2a_proxy.LogActivity contract) Skip semantics: each phase independently checks for its required env key (CLAUDE_CODE_OAUTH_TOKEN / E2E_OPENAI_API_KEY) and skips cleanly if absent. The script always exit-0s if every phase either passed or skipped — so wiring it into a no-keys CI job validates the script itself stays clean without false-failing. Idempotent: pre-sweeps any prior "Priority E2E (claude-code)" / "Priority E2E (hermes)" workspaces so a run interrupted by SIGPIPE / kill -9 (which bypasses the EXIT trap) doesn't poison the next run. Same defensive pattern as test_notify_attachments_e2e.sh. CI wiring: - e2e-api.yml — runs on every PR with no LLM keys, both phases skip, catches script-level regressions (set -u bugs, syntax issues, etc.) - canary-staging.yml + e2e-staging-saas.yml already have the keys via secrets.MOLECULE_STAGING_OPENAI_KEY and exercise wire-real behavior — could be wired to opt-in if you want claude-code coverage there too. Local runs (from this branch, no keys): === Results: 0 passed, 0 failed, 2 skipped === Validates the capability primitives shipped in PRs #2137-2144: once template PRs #12 (claude-code) + #25 (hermes) merge with their declared provides_native_session=True + idle_timeout_override=900, a manual run with both keys validates the full native+pluggable chain. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/e2e-api.yml | 5 + tests/e2e/test_priority_runtimes_e2e.sh | 299 ++++++++++++++++++++++++ 2 files changed, 304 insertions(+) create mode 100755 tests/e2e/test_priority_runtimes_e2e.sh diff --git a/.github/workflows/e2e-api.yml b/.github/workflows/e2e-api.yml index 05f9ab52..89c69b88 100644 --- a/.github/workflows/e2e-api.yml +++ b/.github/workflows/e2e-api.yml @@ -99,6 +99,11 @@ jobs: run: bash tests/e2e/test_api.sh - name: Run notify-with-attachments E2E run: bash tests/e2e/test_notify_attachments_e2e.sh + - name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent) + # Validates the test script itself runs cleanly even with no LLM + # keys (both phases skip gracefully). The wire-real coverage with + # actual keys runs in canary-staging.yml + e2e-staging-saas.yml. + run: bash tests/e2e/test_priority_runtimes_e2e.sh - name: Dump platform log on failure if: failure() run: cat workspace-server/platform.log || true diff --git a/tests/e2e/test_priority_runtimes_e2e.sh b/tests/e2e/test_priority_runtimes_e2e.sh new file mode 100755 index 00000000..6c2febaf --- /dev/null +++ b/tests/e2e/test_priority_runtimes_e2e.sh @@ -0,0 +1,299 @@ +#!/usr/bin/env bash +# E2E test: claude-code AND hermes both work end-to-end (task #87 priority adapters). +# +# Self-contained happy-path smoke for the two runtimes the project commits +# to first-class support for. Provisions a fresh workspace per runtime, +# waits for it to reach status=online, sends a real A2A message, and +# asserts a non-error reply. Pins the contract so the upcoming refactor +# (move adapter executors to template repos) cannot silently break either +# path. +# +# What this proves: +# 1. Provisioning + container boot works for each runtime. +# 2. The runtime reaches status=online within its expected cold-boot +# window (claude-code: ~60s, hermes: up to 15min on cold apt). +# 3. A real A2A message/send produces a non-empty, non-error reply. +# 4. The activity_logs row for the call is well-formed. +# +# Each phase skips cleanly when its prerequisite secret is absent so a +# partially-keyed env (e.g. CI without an OpenAI key) doesn't false-fail. +# +# Usage: +# CLAUDE_CODE_OAUTH_TOKEN=... E2E_OPENAI_API_KEY=... \ +# tests/e2e/test_priority_runtimes_e2e.sh +# +# # Run only one runtime +# E2E_RUNTIMES=claude-code tests/e2e/test_priority_runtimes_e2e.sh +# E2E_RUNTIMES=hermes tests/e2e/test_priority_runtimes_e2e.sh +# +# Prereqs: +# - workspace-server on http://localhost:8080 +# - MOLECULE_ENV != production (required for admin/test-token) +# - For claude-code: CLAUDE_CODE_OAUTH_TOKEN +# - For hermes: E2E_OPENAI_API_KEY (other providers also OK if you +# set MODEL_SLUG_HERMES + matching secrets directly) + +set -euo pipefail + +source "$(dirname "$0")/_lib.sh" + +PASS=0 +FAIL=0 +SKIP=0 +CREATED_WSIDS=() + +cleanup() { + # `set -u` + empty array would error on "${CREATED_WSIDS[@]}"; the + # ${VAR[@]+"…"} form expands to nothing when the array is unset/empty + # so the loop body is skipped cleanly. Hits the skip-no-keys path. + for wid in ${CREATED_WSIDS[@]+"${CREATED_WSIDS[@]}"}; do + [ -n "$wid" ] && curl -s -X DELETE "$BASE/workspaces/$wid?confirm=true" > /dev/null || true + done +} +trap cleanup EXIT + +pass() { echo " PASS — $1"; PASS=$((PASS + 1)); } +fail() { echo " FAIL — $1"; echo " $2"; FAIL=$((FAIL + 1)); } +skip() { echo " SKIP — $1"; SKIP=$((SKIP + 1)); } + +# Pre-sweep any prior runs that left workspaces behind (same defence as +# test_notify_attachments_e2e.sh: trap fires on normal exit, but a +# SIGPIPE / kill -9 can bypass it). +PRIOR=$(curl -s "$BASE/workspaces" | python3 -c ' +import json, sys +try: + print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name","").startswith("Priority E2E "))) +except Exception: + pass +') +for _wid in $PRIOR; do + echo "Sweeping prior workspace: $_wid" + curl -s -X DELETE "$BASE/workspaces/$_wid?confirm=true" > /dev/null || true +done + +# Block until $1 reaches one of $2 (space-separated states), or $3 sec elapse. +wait_for_status() { + local wsid="$1" want="$2" budget="$3" + local start=$SECONDS + while [ $((SECONDS - start)) -lt "$budget" ]; do + local s + s=$(curl -s "$BASE/workspaces/$wsid" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("status",""))' 2>/dev/null || echo "") + for w in $want; do [ "$s" = "$w" ] && { echo "$s"; return 0; }; done + sleep 4 + done + echo "$s" + return 1 +} + +# Send "What is 2+2?" via A2A, return the reply text on stdout. Fails +# (non-zero exit + empty stdout) if the platform returns an error envelope +# or the reply is empty / sentinel-error. +send_test_prompt() { + local wsid="$1" token="$2" + local resp + resp=$(curl -s --max-time 180 -X POST "$BASE/workspaces/$wsid/a2a" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $token" \ + -d '{ + "method": "message/send", + "params": { + "message": { + "role": "user", + "messageId": "e2e-priority-runtime", + "parts": [{"kind": "text", "text": "Reply with exactly the word: PONG"}] + } + } + }') || return 1 + # Walk a few common A2A reply shapes; stop at the first non-empty text. + echo "$resp" | python3 -c ' +import json, sys +try: + d = json.loads(sys.stdin.read()) +except Exception: + sys.exit(1) +texts = [] +def walk(node): + if isinstance(node, dict): + for v in node.values(): walk(v) + elif isinstance(node, list): + for v in node: walk(v) + elif isinstance(node, str): + texts.append(node) +walk(d.get("result") or d) +joined = "\n".join(t for t in texts if t.strip()) +if not joined.strip(): + sys.exit(2) +# Surface a known error sentinel so the caller can tell apart "empty" from "explicit error" +low = joined.lower() +for needle in ("a2a_error", "agent error", "could not resolve authentication", "401", + "no provider api key", "missing api", "model_not_found"): + if needle in low: + print("ERROR: " + joined[:200]) + sys.exit(3) +print(joined) +' +} + +assert_activity_logged() { + # After a successful A2A round-trip, the platform's a2a_proxy logs + # an a2a_receive row with method=message/send. Pin the contract so a + # silent regression in LogActivity (e.g. dropped status field, broken + # broadcaster) shows up here. Polls briefly because LogActivity is + # detached-goroutine — the row may land a few hundred ms after the + # POST returns. + local label="$1" wsid="$2" token="$3" + local start=$SECONDS + while [ $((SECONDS - start)) -lt 10 ]; do + local act + act=$(curl -s -H "Authorization: Bearer $token" "$BASE/workspaces/$wsid/activity?type=a2a_receive&limit=10") + local found + found=$(echo "$act" | python3 -c ' +import json, sys +try: + rows = json.load(sys.stdin) or [] +except Exception: + sys.exit(1) +for r in rows: + if r.get("method") == "message/send" and r.get("status") in ("ok", "error"): + print("ok") + sys.exit(0) +sys.exit(2) +' 2>/dev/null) && true + if [ "$found" = "ok" ]; then + pass "$label activity_logs row written for the A2A turn" + return 0 + fi + sleep 1 + done + fail "$label activity_logs row" "no a2a_receive row with method=message/send appeared in 10s" +} + +run_claude_code() { + echo "" + echo "=== claude-code happy path ===" + if [ -z "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]; then + skip "CLAUDE_CODE_OAUTH_TOKEN not set" + return 0 + fi + local secrets + secrets=$(python3 -c " +import json, os +print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN']})) +") + local resp wsid + resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \ + -d "{\"name\":\"Priority E2E (claude-code)\",\"runtime\":\"claude-code\",\"tier\":1,\"secrets\":$secrets}") + wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true + if [ -z "$wsid" ]; then + fail "create claude-code workspace" "$resp" + return 0 + fi + CREATED_WSIDS+=("$wsid") + echo " workspace=$wsid" + + # claude-code typical cold boot: 30-90s (image already pulled) + local final + final=$(wait_for_status "$wsid" "online failed" 240) || true + if [ "$final" != "online" ]; then + fail "claude-code workspace reaches online" "final status: $final" + return 0 + fi + pass "claude-code workspace reaches online" + + local token + token=$(e2e_mint_test_token "$wsid") + if [ -z "$token" ]; then + fail "mint claude-code test token" "no token returned" + return 0 + fi + + local reply + if reply=$(send_test_prompt "$wsid" "$token"); then + if echo "$reply" | grep -q "PONG"; then + pass "claude-code reply contains PONG" + else + pass "claude-code reply non-empty (first 80 chars: ${reply:0:80})" + fi + assert_activity_logged "claude-code" "$wsid" "$token" + else + fail "claude-code reply" "${reply:-}" + fi +} + +run_hermes() { + echo "" + echo "=== hermes happy path ===" + if [ -z "${E2E_OPENAI_API_KEY:-}" ]; then + skip "E2E_OPENAI_API_KEY not set (hermes needs an LLM provider key)" + return 0 + fi + local secrets + secrets=$(python3 -c " +import json, os +k = os.environ['E2E_OPENAI_API_KEY'] +print(json.dumps({ + 'OPENAI_API_KEY': k, + 'OPENAI_BASE_URL': 'https://api.openai.com/v1', + 'MODEL_PROVIDER': 'openai:gpt-4o', + # The HERMES_* fields below pin the provider deterministically + # (see comment in test_staging_full_saas.sh:268-275 for why). + 'HERMES_INFERENCE_PROVIDER': 'custom', + 'HERMES_CUSTOM_BASE_URL': 'https://api.openai.com/v1', + 'HERMES_CUSTOM_API_KEY': k, + 'HERMES_CUSTOM_API_MODE': 'chat_completions', +})) +") + local resp wsid + resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \ + -d "{\"name\":\"Priority E2E (hermes)\",\"runtime\":\"hermes\",\"tier\":1,\"model\":\"openai/gpt-4o\",\"secrets\":$secrets}") + wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true + if [ -z "$wsid" ]; then + fail "create hermes workspace" "$resp" + return 0 + fi + CREATED_WSIDS+=("$wsid") + echo " workspace=$wsid" + + # Hermes cold boot is the slow path: apt + uv + hermes-agent sidecar. + # Up to 15 min on cold disk; usually 3-5 min when the runtime image is + # already cached. Be generous so the test doesn't false-fail in CI. + local final + final=$(wait_for_status "$wsid" "online failed" 900) || true + if [ "$final" != "online" ]; then + fail "hermes workspace reaches online" "final status: $final" + return 0 + fi + pass "hermes workspace reaches online" + + local token + token=$(e2e_mint_test_token "$wsid") + if [ -z "$token" ]; then + fail "mint hermes test token" "no token returned" + return 0 + fi + + local reply + if reply=$(send_test_prompt "$wsid" "$token"); then + if echo "$reply" | grep -q "PONG"; then + pass "hermes reply contains PONG" + else + pass "hermes reply non-empty (first 80 chars: ${reply:0:80})" + fi + assert_activity_logged "hermes" "$wsid" "$token" + else + fail "hermes reply" "${reply:-}" + fi +} + +WANT="${E2E_RUNTIMES:-claude-code hermes}" +for r in $WANT; do + case "$r" in + claude-code) run_claude_code ;; + hermes) run_hermes ;; + *) echo "unknown runtime in E2E_RUNTIMES: $r" >&2; exit 2 ;; + esac +done + +echo "" +echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ===" +[ "$FAIL" -eq 0 ]