From a4b3ebf951c3ffb44ed8d3dfe046c79a9ef1fc10 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Sun, 26 Apr 2026 23:48:54 -0700
Subject: [PATCH] test(e2e): claude-code + hermes priority-runtimes happy path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Self-contained happy-path E2E for the two runtimes the project commits
to first-class support for (task #116, completes the loop on the
"both must work end-to-end with tests" requirement).

What it proves per runtime:
  1. POST /workspaces succeeds with the runtime + secrets
  2. Workspace reaches status=online within its cold-boot window
     (claude-code: 240s, hermes: 900s on cold apt + uv + sidecar)
  3. POST /workspaces/:id/a2a (message/send, prompt "Reply with exactly
     the word: PONG") returns a non-error, non-empty reply
  4. activity_logs row written with method=message/send and ok|error
     status (a2a_proxy.LogActivity contract)

Skip semantics: each phase independently checks for its required env
key (CLAUDE_CODE_OAUTH_TOKEN / E2E_OPENAI_API_KEY) and skips cleanly
if absent. The script always exit-0s if every phase either passed or
skipped — so wiring it into a no-keys CI job validates the script
itself stays clean without false-failing.

Idempotent: pre-sweeps any prior "Priority E2E (claude-code)" /
"Priority E2E (hermes)" workspaces so a run interrupted by SIGPIPE /
kill -9 (which bypasses the EXIT trap) doesn't poison the next run.
Same defensive pattern as test_notify_attachments_e2e.sh.

CI wiring:
  - e2e-api.yml — runs on every PR with no LLM keys, both phases skip,
    catches script-level regressions (set -u bugs, syntax issues, etc.)
  - canary-staging.yml + e2e-staging-saas.yml already have the keys
    via secrets.MOLECULE_STAGING_OPENAI_KEY and exercise wire-real
    behavior — could be wired to opt-in if you want claude-code coverage
    there too.

Local runs (from this branch, no keys):
  === Results: 0 passed, 0 failed, 2 skipped ===

Validates the capability primitives shipped in PRs #2137-2144: once
template PRs #12 (claude-code) + #25 (hermes) merge with their
declared provides_native_session=True + idle_timeout_override=900,
a manual run with both keys validates the full native+pluggable chain.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/e2e-api.yml           |   5 +
 tests/e2e/test_priority_runtimes_e2e.sh | 299 ++++++++++++++++++++++++
 2 files changed, 304 insertions(+)
 create mode 100755 tests/e2e/test_priority_runtimes_e2e.sh

diff --git a/.github/workflows/e2e-api.yml b/.github/workflows/e2e-api.yml
index 05f9ab52..89c69b88 100644
--- a/.github/workflows/e2e-api.yml
+++ b/.github/workflows/e2e-api.yml
@@ -99,6 +99,11 @@ jobs:
         run: bash tests/e2e/test_api.sh
       - name: Run notify-with-attachments E2E
         run: bash tests/e2e/test_notify_attachments_e2e.sh
+      - name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
+        # Validates the test script itself runs cleanly even with no LLM
+        # keys (both phases skip gracefully). The wire-real coverage with
+        # actual keys runs in canary-staging.yml + e2e-staging-saas.yml.
+        run: bash tests/e2e/test_priority_runtimes_e2e.sh
       - name: Dump platform log on failure
         if: failure()
         run: cat workspace-server/platform.log || true
diff --git a/tests/e2e/test_priority_runtimes_e2e.sh b/tests/e2e/test_priority_runtimes_e2e.sh
new file mode 100755
index 00000000..6c2febaf
--- /dev/null
+++ b/tests/e2e/test_priority_runtimes_e2e.sh
@@ -0,0 +1,299 @@
+#!/usr/bin/env bash
+# E2E test: claude-code AND hermes both work end-to-end (task #87 priority adapters).
+#
+# Self-contained happy-path smoke for the two runtimes the project commits
+# to first-class support for. Provisions a fresh workspace per runtime,
+# waits for it to reach status=online, sends a real A2A message, and
+# asserts a non-error reply. Pins the contract so the upcoming refactor
+# (move adapter executors to template repos) cannot silently break either
+# path.
+#
+# What this proves:
+#   1. Provisioning + container boot works for each runtime.
+#   2. The runtime reaches status=online within its polling budget
+#      (claude-code: 240s, hermes: 900s; hermes can need ~15min on cold apt).
+#   3. A real A2A message/send produces a non-empty, non-error reply.
+#   4. The activity_logs row for the call is well-formed.
+#
+# Each phase skips cleanly when its prerequisite secret is absent so a
+# partially-keyed env (e.g. CI without an OpenAI key) doesn't false-fail.
+#
+# Usage:
+#   CLAUDE_CODE_OAUTH_TOKEN=... E2E_OPENAI_API_KEY=... \
+#     tests/e2e/test_priority_runtimes_e2e.sh
+#
+#   # Run only one runtime
+#   E2E_RUNTIMES=claude-code tests/e2e/test_priority_runtimes_e2e.sh
+#   E2E_RUNTIMES=hermes      tests/e2e/test_priority_runtimes_e2e.sh
+#
+# Prereqs:
+#   - workspace-server on http://localhost:8080
+#   - MOLECULE_ENV != production (required for admin/test-token)
+#   - For claude-code: CLAUDE_CODE_OAUTH_TOKEN
+#   - For hermes:      E2E_OPENAI_API_KEY  (other providers also OK if you
+#                       set MODEL_SLUG_HERMES + matching secrets directly)
+
+set -euo pipefail  # abort on command errors, unset variables, failed pipeline stages
+
+source "$(dirname "$0")/_lib.sh"  # BASE / e2e_mint_test_token are not defined in this file; presumably provided here
+
+PASS=0  # assertions that passed
+FAIL=0  # assertions that failed; non-zero makes the script exit non-zero
+SKIP=0  # phases skipped because their required secret was absent
+CREATED_WSIDS=()  # ids of workspaces created by this run; deleted by cleanup()
+
+cleanup() {
+  # EXIT trap: best-effort delete of every workspace this run created. The
+  # ${A[@]+"${A[@]}"} form yields no words for an empty array under `set -u`.
+  local wid
+  for wid in ${CREATED_WSIDS[@]+"${CREATED_WSIDS[@]}"}; do
+    [ -z "$wid" ] || curl -s -X DELETE "$BASE/workspaces/$wid?confirm=true" > /dev/null || true
+  done
+}
+trap cleanup EXIT
+
+pass()  { printf '  PASS — %s\n' "$1"; PASS=$((PASS + 1)); }  # $1: what passed
+fail()  { printf '  FAIL — %s\n         %s\n' "$1" "$2"; FAIL=$((FAIL + 1)); }  # $1: what failed, $2: detail
+skip()  { printf '  SKIP — %s\n' "$1"; SKIP=$((SKIP + 1)); }  # $1: why skipped
+
+# Pre-sweep workspaces left behind by earlier runs (the EXIT trap does not
+# fire on kill -9). Best-effort: under `set -e` + pipefail a failed curl would
+# otherwise abort the script here, so fall back to an empty list instead.
+PRIOR=$(curl -s "$BASE/workspaces" | python3 -c '
+import json, sys
+try:
+    print(" ".join(w["id"] for w in json.load(sys.stdin) if w.get("name","").startswith("Priority E2E ")))
+except Exception:
+    pass
+') || PRIOR=""
+for _wid in $PRIOR; do
+  echo "Sweeping prior workspace: $_wid"
+  curl -s -X DELETE "$BASE/workspaces/$_wid?confirm=true" > /dev/null || true
+done
+
+# Poll workspace $1 until its status is in $2 (space-separated) or $3 s elapse.
+wait_for_status() {
+  # Prints the last status seen; returns 0 on a match, 1 on timeout. Each probe
+  # is capped at 10s so a single hung request cannot stall the poll forever.
+  local wsid="$1" want="$2" budget="$3" start=$SECONDS s="" w  # s pre-set: safe under set -u if the loop never runs
+  while [ $((SECONDS - start)) -lt "$budget" ]; do
+    s=$(curl -s --max-time 10 "$BASE/workspaces/$wsid" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("status",""))' 2>/dev/null || echo "")
+    for w in $want; do [ "$s" = "$w" ] && { echo "$s"; return 0; }; done
+    sleep 4
+  done
+  echo "$s"
+  return 1
+}
+
+# Send "What is 2+2?" via A2A, return the reply text on stdout. Fails
+# (non-zero exit + empty stdout) if the platform returns an error envelope
+# or the reply is empty / sentinel-error.
+send_test_prompt() {
+  local wsid="$1" token="$2"
+  local resp
+  resp=$(curl -s --max-time 180 -X POST "$BASE/workspaces/$wsid/a2a" \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer $token" \
+    -d '{
+      "method": "message/send",
+      "params": {
+        "message": {
+          "role": "user",
+          "messageId": "e2e-priority-runtime",
+          "parts": [{"kind": "text", "text": "Reply with exactly the word: PONG"}]
+        }
+      }
+    }') || return 1
+  # Walk a few common A2A reply shapes; stop at the first non-empty text.
+  echo "$resp" | python3 -c '
+import json, sys
+try:
+    d = json.loads(sys.stdin.read())
+except Exception:
+    sys.exit(1)
+texts = []
+def walk(node):
+    if isinstance(node, dict):
+        for v in node.values(): walk(v)
+    elif isinstance(node, list):
+        for v in node: walk(v)
+    elif isinstance(node, str):
+        texts.append(node)
+walk(d.get("result") or d)
+joined = "\n".join(t for t in texts if t.strip())
+if not joined.strip():
+    sys.exit(2)
+# Surface a known error sentinel so the caller can tell apart "empty" from "explicit error"
+low = joined.lower()
+for needle in ("a2a_error", "agent error", "could not resolve authentication", "401",
+               "no provider api key", "missing api", "model_not_found"):
+    if needle in low:
+        print("ERROR: " + joined[:200])
+        sys.exit(3)
+print(joined)
+'
+}
+
+assert_activity_logged() {
+  # Assert that the A2A call left an activity row behind.
+  #   $1 label for PASS/FAIL output, $2 workspace id, $3 bearer token
+  # Polls GET /workspaces/$2/activity?type=a2a_receive&limit=10 once a second
+  # for up to 10s and passes as soon as one row has method=message/send and
+  # status ok or error. Polling is needed because the platform writes the
+  # row asynchronously (LogActivity runs in a detached goroutine), so it may
+  # land shortly after the POST returns. Records a FAIL on timeout.
+  local label="$1" wsid="$2" token="$3"
+  local deadline=$((SECONDS + 10)) rows
+  while [ "$SECONDS" -lt "$deadline" ]; do
+    rows=$(curl -s -H "Authorization: Bearer $token" \
+      "$BASE/workspaces/$wsid/activity?type=a2a_receive&limit=10")
+    # python exits 0 only when a matching row exists; its stderr is dropped.
+    if echo "$rows" | python3 -c '
+import json, sys
+try:
+    rows = json.load(sys.stdin) or []
+except Exception:
+    sys.exit(1)
+hit = any(
+    r.get("method") == "message/send" and r.get("status") in ("ok", "error")
+    for r in rows
+)
+sys.exit(0 if hit else 2)
+' 2>/dev/null; then
+      pass "$label activity_logs row written for the A2A turn"
+      return 0
+    fi
+    sleep 1
+  done
+  fail "$label activity_logs row" "no a2a_receive row with method=message/send appeared in 10s"
+}
+
+run_claude_code() {  # claude-code phase: create -> online -> A2A round-trip -> activity row
+  echo ""
+  echo "=== claude-code happy path ==="
+  if [ -z "${CLAUDE_CODE_OAUTH_TOKEN:-}" ]; then
+    skip "CLAUDE_CODE_OAUTH_TOKEN not set"
+    return 0
+  fi
+  local secrets
+  secrets=$(python3 -c "
+import json, os
+print(json.dumps({'CLAUDE_CODE_OAUTH_TOKEN': os.environ['CLAUDE_CODE_OAUTH_TOKEN']}))
+")
+  local resp wsid
+  resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
+    -d "{\"name\":\"Priority E2E (claude-code)\",\"runtime\":\"claude-code\",\"tier\":1,\"secrets\":$secrets}") || true  # a curl failure must reach the FAIL branch below, not abort via set -e
+  wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
+  if [ -z "$wsid" ]; then
+    fail "create claude-code workspace" "$resp"
+    return 0
+  fi
+  CREATED_WSIDS+=("$wsid")  # registered for deletion by the EXIT trap
+  echo "  workspace=$wsid"
+
+  # claude-code typical cold boot: 30-90s (image already pulled); budget 240s.
+  local final
+  final=$(wait_for_status "$wsid" "online failed" 240) || true
+  if [ "$final" != "online" ]; then
+    fail "claude-code workspace reaches online" "final status: $final"
+    return 0
+  fi
+  pass "claude-code workspace reaches online"
+
+  local token
+  token=$(e2e_mint_test_token "$wsid") || true  # helper is defined outside this file; a non-zero exit must record FAIL, not abort
+  if [ -z "$token" ]; then
+    fail "mint claude-code test token" "no token returned"
+    return 0
+  fi
+
+  local reply
+  if reply=$(send_test_prompt "$wsid" "$token"); then
+    if echo "$reply" | grep -q "PONG"; then
+      pass "claude-code reply contains PONG"
+    else
+      pass "claude-code reply non-empty (first 80 chars: ${reply:0:80})"
+    fi
+    assert_activity_logged "claude-code" "$wsid" "$token"
+  else
+    fail "claude-code reply" "${reply:-<empty or error>}"
+  fi
+}
+
+run_hermes() {  # hermes phase: create -> online -> A2A round-trip -> activity row
+  echo ""
+  echo "=== hermes happy path ==="
+  if [ -z "${E2E_OPENAI_API_KEY:-}" ]; then
+    skip "E2E_OPENAI_API_KEY not set (hermes needs an LLM provider key)"
+    return 0
+  fi
+  local secrets
+  secrets=$(python3 -c "
+import json, os
+k = os.environ['E2E_OPENAI_API_KEY']
+print(json.dumps({
+    'OPENAI_API_KEY': k,
+    'OPENAI_BASE_URL': 'https://api.openai.com/v1',
+    'MODEL_PROVIDER': 'openai:gpt-4o',
+    # The HERMES_* fields below pin the provider deterministically
+    # (see comment in test_staging_full_saas.sh:268-275 for why).
+    'HERMES_INFERENCE_PROVIDER': 'custom',
+    'HERMES_CUSTOM_BASE_URL': 'https://api.openai.com/v1',
+    'HERMES_CUSTOM_API_KEY': k,
+    'HERMES_CUSTOM_API_MODE': 'chat_completions',
+}))
+")
+  local resp wsid
+  resp=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
+    -d "{\"name\":\"Priority E2E (hermes)\",\"runtime\":\"hermes\",\"tier\":1,\"model\":\"openai/gpt-4o\",\"secrets\":$secrets}") || true  # a curl failure must reach the FAIL branch below, not abort via set -e
+  wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
+  if [ -z "$wsid" ]; then
+    fail "create hermes workspace" "$resp"
+    return 0
+  fi
+  CREATED_WSIDS+=("$wsid")  # registered for deletion by the EXIT trap
+  echo "  workspace=$wsid"
+
+  # Hermes cold boot is the slow path: apt + uv + hermes-agent sidecar.
+  # Up to 15 min on cold disk; usually 3-5 min when the runtime image is
+  # already cached. Be generous so the test doesn't false-fail in CI.
+  local final
+  final=$(wait_for_status "$wsid" "online failed" 900) || true
+  if [ "$final" != "online" ]; then
+    fail "hermes workspace reaches online" "final status: $final"
+    return 0
+  fi
+  pass "hermes workspace reaches online"
+
+  local token
+  token=$(e2e_mint_test_token "$wsid") || true  # helper is defined outside this file; a non-zero exit must record FAIL, not abort
+  if [ -z "$token" ]; then
+    fail "mint hermes test token" "no token returned"
+    return 0
+  fi
+
+  local reply
+  if reply=$(send_test_prompt "$wsid" "$token"); then
+    if echo "$reply" | grep -q "PONG"; then
+      pass "hermes reply contains PONG"
+    else
+      pass "hermes reply non-empty (first 80 chars: ${reply:0:80})"
+    fi
+    assert_activity_logged "hermes" "$wsid" "$token"
+  else
+    fail "hermes reply" "${reply:-<empty or error>}"
+  fi
+}
+
+WANT="${E2E_RUNTIMES:-claude-code hermes}"
+for r in $WANT; do
+  case "$r" in
+    claude-code) run_claude_code ;;
+    hermes)      run_hermes ;;
+    *) echo "unknown runtime in E2E_RUNTIMES: $r" >&2; exit 2 ;;
+  esac
+done
+
+echo ""
+echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ==="
+[ "$FAIL" -eq 0 ]