diff --git a/.gitea/workflows/e2e-peer-visibility.yml b/.gitea/workflows/e2e-peer-visibility.yml
new file mode 100644
index 00000000..f7b13f16
--- /dev/null
+++ b/.gitea/workflows/e2e-peer-visibility.yml
@@ -0,0 +1,225 @@
+name: E2E Peer Visibility (literal MCP list_peers)
+
+# WHY A DEDICATED WORKFLOW (not folded into e2e-staging-saas.yml)
+# --------------------------------------------------------------
+# This is the systemic fix for a real trust failure. Hermes and OpenClaw
+# were reported "fleet-verified / cascade-complete" because the *proxy*
+# signals were green (registry registration + heartbeat for Hermes; model
+# round-trip 200 for OpenClaw). A freshly-provisioned workspace, asked on
+# canvas "can you see your peers", actually FAILS:
+#   - Hermes: 401 on the molecule MCP `list_peers` call
+#   - OpenClaw: native `sessions_list` fallback, sees no platform peers
+# Tasks #142/#159 were even marked "completed" under this proxy flaw.
+#
+# A dedicated workflow (vs extending e2e-staging-saas.yml) because:
+#   - It must provision MULTIPLE distinct runtimes (hermes, openclaw,
+#     claude-code) in ONE org and assert each sees the others. The
+#     full-saas script is single-runtime-per-run (E2E_RUNTIME) and folding
+#     a multi-runtime matrix into it would conflate concerns and bloat its
+#     already-45-min run.
+#   - It needs its own concurrency group so it doesn't fight full-saas /
+#     canvas for the staging org-creation quota.
+#   - It needs an independent, non-required status-context name so it can
+#     be RED today (the in-flight Hermes-401 / OpenClaw-MCP-wiring fixes
+#     have not landed) WITHOUT wedging unrelated merges — and flipped to
+#     REQUIRED in one branch-protection edit once it goes green
+#     (flip-to-required checklist: molecule-core#1296).
+#
+# THE ASSERTION IS NOT A PROXY. The driving script
+# tests/e2e/test_peer_visibility_mcp_staging.sh issues the byte-for-byte
+# JSON-RPC `tools/call name=list_peers` envelope to `POST
+# /workspaces/:id/mcp` using each workspace's OWN bearer token, through
+# the real WorkspaceAuth + MCPRateLimiter middleware chain — the exact
+# call mcp_molecule_list_peers makes from a canvas agent (the envelope is
+# sketched at the end of this header). It does NOT read a registry row,
+# /health, the heartbeat table, or GET /registry/:id/peers.
+#
+# HONEST GATE — NO continue-on-error. Per feedback_fix_root_not_symptom,
+# a fake-green mask would defeat the entire purpose. This workflow goes
+# red on today's broken behavior and green only when the root-cause fixes
+# actually land. It is intentionally NOT in branch_protections — see PR
+# body for the required-vs-not decision + flip tracking issue.
+#
+# Gitea 1.22.6 / act_runner notes honored:
+#   - No cross-repo `uses:` (feedback_gitea_cross_repo_uses_blocked). The
+#     actions/checkout SHA is the one e2e-staging-canvas.yml already uses
+#     successfully (a mirrored SHA — see #1277/PR#1292 root-cause).
+#   - Per-SHA concurrency, not global (feedback_concurrency_group_per_sha).
+#   - Workflow-level GITHUB_SERVER_URL pinned
+#     (feedback_act_runner_github_server_url).
+#   - pr-validate posts a status under the same check name so a
+#     workflow-only PR is not silently statusless and the context is
+#     flip-to-required-ready (mirrors e2e-staging-saas.yml's proven shape;
+#     real EC2-provisioning E2E is push/dispatch/cron only — it is 30+ min
+#     and cannot run per-PR-update).
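+#
+# For reference, the envelope the gate drives (shape copied from the
+# driving script's RPC_BODY; the script, not this comment, is the source
+# of truth):
+#   POST <tenant workspace-server>/workspaces/:id/mcp
+#   Authorization: Bearer <that workspace's own auth_token>
+#   {"jsonrpc":"2.0","id":1,"method":"tools/call",
+#    "params":{"name":"list_peers","arguments":{}}}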
+ +on: + push: + branches: [main] + paths: + - 'workspace-server/internal/handlers/mcp.go' + - 'workspace-server/internal/handlers/mcp_tools.go' + - 'workspace-server/internal/middleware/**' + - 'workspace-server/internal/handlers/registry.go' + - 'workspace-server/internal/handlers/workspace.go' + - 'workspace/a2a_mcp_server.py' + - 'workspace/platform_tools/registry.py' + - 'tests/e2e/test_peer_visibility_mcp_staging.sh' + - '.gitea/workflows/e2e-peer-visibility.yml' + pull_request: + branches: [main] + paths: + - 'workspace-server/internal/handlers/mcp.go' + - 'workspace-server/internal/handlers/mcp_tools.go' + - 'workspace-server/internal/middleware/**' + - 'workspace-server/internal/handlers/registry.go' + - 'workspace-server/internal/handlers/workspace.go' + - 'workspace/a2a_mcp_server.py' + - 'workspace/platform_tools/registry.py' + - 'tests/e2e/test_peer_visibility_mcp_staging.sh' + - '.gitea/workflows/e2e-peer-visibility.yml' + workflow_dispatch: + schedule: + # 07:30 UTC daily — catches AMI / template-hermes / template-openclaw + # drift even on quiet days. Offset 30m from e2e-staging-saas (07:00) + # so the two don't collide on the staging org-creation quota. + - cron: '30 7 * * *' + +concurrency: + # Per-SHA (feedback_concurrency_group_per_sha). A single global group + # would let a queued staging/main push behind a PR run get cancelled, + # leaving any gate that reads "completed run at SHA" stuck. + group: e2e-peer-visibility-${{ github.event.pull_request.head.sha || github.sha }} + cancel-in-progress: false + +env: + GITHUB_SERVER_URL: https://git.moleculesai.app + +jobs: + # PR path: post a real status under the required-ready check name so a + # workflow-only PR is never silently statusless. The actual EC2 E2E is + # push/dispatch/cron only (30+ min). This is NOT a fake-green mask of + # the real assertion — it validates the driving script's bash syntax + # and inline-python so a broken test script fails at PR time. + pr-validate: + name: E2E Peer Visibility + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + timeout-minutes: 5 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Validate driving script + run: | + bash -n tests/e2e/test_peer_visibility_mcp_staging.sh + echo "test_peer_visibility_mcp_staging.sh — bash syntax OK" + echo "Real fresh-provision MCP list_peers E2E runs on push to" + echo "main / workflow_dispatch / daily cron (30+ min EC2 boot)." + + # Real gate: provisions a throwaway org + sibling-per-runtime, drives + # the LITERAL list_peers MCP call per runtime, asserts 200 + expected + # peer set, then scoped teardown. push(main)/dispatch/cron only. + peer-visibility: + name: E2E Peer Visibility + runs-on: ubuntu-latest + if: github.event_name != 'pull_request' + timeout-minutes: 60 + + env: + MOLECULE_CP_URL: https://staging-api.moleculesai.app + MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + # LLM provider key so each runtime can authenticate at boot. + # Priority MiniMax → direct-Anthropic → OpenAI matches + # test_staging_full_saas.sh's secrets-injection chain. 
+ E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }} + E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }} + E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }} + E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}" + PV_RUNTIMES: "hermes openclaw claude-code" + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Verify admin token present + run: | + if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then + echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" + exit 2 + fi + echo "Admin token present" + + - name: Verify an LLM key present + run: | + if [ -z "${E2E_MINIMAX_API_KEY:-}" ] && [ -z "${E2E_ANTHROPIC_API_KEY:-}" ] && [ -z "${E2E_OPENAI_API_KEY:-}" ]; then + echo "::error::No LLM provider key set — workspaces fail at boot with 'No provider API key found'. Set MOLECULE_STAGING_MINIMAX_API_KEY (or ANTHROPIC / OPENAI)." + exit 2 + fi + echo "LLM key present" + + - name: CP staging health preflight + run: | + code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health") + if [ "$code" != "200" ]; then + echo "::error::Staging CP unhealthy (HTTP $code) — infra, not a workspace bug. Failing loud per feedback_fix_root_not_symptom." + exit 1 + fi + echo "Staging CP healthy" + + - name: Run fresh-provision peer-visibility E2E (literal MCP list_peers) + run: bash tests/e2e/test_peer_visibility_mcp_staging.sh + + # Belt-and-braces scoped teardown: the script installs an EXIT/INT/ + # TERM trap, but if the runner itself is cancelled the trap may not + # fire. This always() step deletes ONLY the e2e-pv- org this + # run created — never a cluster-wide sweep + # (feedback_never_run_cluster_cleanup_tests_on_live_platform). The + # admin DELETE is idempotent so double-invoking is safe; + # sweep-stale-e2e-orgs is the final net (slug starts with 'e2e-'). + - name: Teardown safety net (runs on cancel/failure) + if: always() + env: + ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + run: | + set +e + orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \ + -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ + | python3 -c " + import json, sys, os, datetime + run_id = os.environ.get('GITHUB_RUN_ID', '') + try: + d = json.load(sys.stdin) + except Exception: + print(''); sys.exit(0) + # ONLY sweep slugs from THIS run. e2e-pv---... + # Sweep today AND yesterday's UTC date so a midnight-crossing run + # still matches its own slug (same bug class as the saas/canvas + # safety nets). 
+ today = datetime.date.today() + yest = today - datetime.timedelta(days=1) + dates = (today.strftime('%Y%m%d'), yest.strftime('%Y%m%d')) + if run_id: + prefixes = tuple(f'e2e-pv-{dt}-{run_id}-' for dt in dates) + else: + prefixes = tuple(f'e2e-pv-{dt}-' for dt in dates) + orgs = d if isinstance(d, list) else d.get('orgs', []) + cands = [o['slug'] for o in orgs + if any(o.get('slug','').startswith(p) for p in prefixes) + and o.get('instance_status') not in ('purged',)] + print('\n'.join(cands)) + " 2>/dev/null) + for slug in $orgs; do + echo "Safety-net teardown: $slug" + set +e + curl -sS -o /tmp/pv-cleanup.out -w "%{http_code}" \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"confirm\":\"$slug\"}" >/tmp/pv-cleanup.code + set -e + code=$(cat /tmp/pv-cleanup.code 2>/dev/null || echo "000") + if [ "$code" = "200" ] || [ "$code" = "204" ]; then + echo "[teardown] deleted $slug (HTTP $code)" + else + echo "::warning::pv teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within MAX_AGE_MINUTES. Body: $(head -c 300 /tmp/pv-cleanup.out 2>/dev/null)" + fi + done + exit 0 diff --git a/tests/e2e/test_peer_visibility_mcp_staging.sh b/tests/e2e/test_peer_visibility_mcp_staging.sh new file mode 100755 index 00000000..44bb35aa --- /dev/null +++ b/tests/e2e/test_peer_visibility_mcp_staging.sh @@ -0,0 +1,376 @@ +#!/usr/bin/env bash +# Staging E2E — fresh-provision peer-visibility gate via the LITERAL MCP path. +# +# WHY THIS EXISTS +# --------------- +# Hermes and OpenClaw were repeatedly reported "fleet-verified / cascade- +# complete" because the *proxy* signals were green: +# - registry-registration + heartbeat (Hermes), and +# - model round-trip 200 (OpenClaw). +# But a freshly-provisioned workspace, asked on canvas "can you see your +# peers", actually FAILS: +# - Hermes: 401 on the molecule MCP `list_peers` call, +# - OpenClaw: falls back to native `sessions_list`, sees no platform peers. +# Tasks #142/#159 were even marked "completed" under this same proxy flaw. +# +# This script codifies the LITERAL user-facing path so it can never silently +# regress: it provisions a brand-new throwaway org + sibling workspaces via +# the real control-plane provisioning path, then for each runtime that should +# have platform peer-visibility it drives the EXACT MCP call the canvas agent +# makes — `POST /workspaces/:id/mcp` JSON-RPC tools/call name=list_peers, +# authenticated by that workspace's own bearer token through the real +# WorkspaceAuth + MCPRateLimiter middleware chain. It then asserts: +# (1) HTTP 200, +# (2) JSON-RPC `result` present (NOT an `error` object — a -32000 +# "tool call failed" or a 401 from WorkspaceAuth fails here), +# (3) the returned peer set CONTAINS the other provisioned sibling +# workspace IDs — not an empty list, not a native-sessions fallback. +# +# This is NOT a proxy. It does not look at a registry row, /health, the +# heartbeat table, or `GET /registry/:id/peers`. It drives the byte-for-byte +# JSON-RPC envelope that mcp_molecule_list_peers issues from a real agent. +# +# It is written to FAIL on today's broken Hermes/OpenClaw behavior and go +# green only when the in-flight root-cause fixes (Hermes-401, OpenClaw MCP +# wiring) actually land. That is the point: it is the objective proof gate. 
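+#
+# Illustrative response shapes the gate distinguishes (the inline parsing
+# in step 6 below is authoritative, not this comment):
+#   PASS: HTTP 200 with a JSON-RPC result whose text contains every
+#     sibling workspace id, e.g.
+#     {"jsonrpc":"2.0","id":1,"result":{"content":[{"type":"text",
+#      "text":"...<sibling workspace ids>..."}]}}
+#   FAIL: HTTP 401 (WorkspaceAuth reject, the Hermes symptom), or
+#     HTTP 200 carrying {"error":{"code":-32000,...}}, or a 200 whose
+#     text reads as a native sessions_list fallback with no platform
+#     peers (the OpenClaw symptom), or a 200 missing sibling ids.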
+# +# AUTH MODEL (mirrors tests/e2e/test_staging_full_saas.sh) +# -------------------------------------------------------- +# Single MOLECULE_ADMIN_TOKEN (= CP_ADMIN_API_TOKEN on Railway staging) +# drives: POST /cp/admin/orgs (provision), GET +# /cp/admin/orgs/:slug/admin-token (per-tenant token), DELETE +# /cp/admin/tenants/:slug (teardown). The per-tenant admin token drives +# tenant workspace creation; each workspace's OWN auth_token (returned by +# POST /workspaces) drives its MCP call. +# +# Required env: +# MOLECULE_ADMIN_TOKEN CP admin bearer — Railway staging CP_ADMIN_API_TOKEN +# Optional env: +# MOLECULE_CP_URL default https://staging-api.moleculesai.app +# E2E_RUN_ID slug suffix; CI passes ${GITHUB_RUN_ID} +# PV_RUNTIMES space list; default "hermes openclaw claude-code" +# E2E_PROVISION_TIMEOUT_SECS default 1800 (hermes/openclaw cold EC2 budget) +# E2E_MINIMAX_API_KEY / E2E_ANTHROPIC_API_KEY / E2E_OPENAI_API_KEY +# LLM provider key injected so the runtime can boot +# E2E_KEEP_ORG 1 → skip teardown (local debugging only) +# +# Exit codes: +# 0 every runtime saw its peers via the literal MCP call +# 1 generic failure +# 2 missing required env +# 3 provisioning timed out +# 4 teardown left orphan resources +# 10 peer-visibility regression reproduced (the gate firing as designed) + +set -uo pipefail + +CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}" +ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}" +RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}" +PV_RUNTIMES="${PV_RUNTIMES:-hermes openclaw claude-code}" +PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-1800}" + +# Slug MUST start with 'e2e-' so the sweep-stale-e2e-orgs safety net +# (EPHEMERAL_PREFIXES) catches any leak this run fails to tear down. +SLUG="e2e-pv-$(date +%Y%m%d)-${RUN_ID_SUFFIX}" +SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32) + +ORG_ID="" +TENANT_URL="" +TENANT_TOKEN="" + +log() { echo "[$(date +%H:%M:%S)] $*"; } +fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; } +ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; } + +admin_call() { + local method="$1" path="$2"; shift 2 + curl -sS -X "$method" "$CP_URL$path" \ + -H "Authorization: Bearer $ADMIN_TOKEN" \ + -H "Content-Type: application/json" "$@" +} +tenant_call() { + local method="$1" path="$2"; shift 2 + curl -sS -X "$method" "$TENANT_URL$path" \ + -H "Authorization: Bearer $TENANT_TOKEN" \ + -H "X-Molecule-Org-Id: $ORG_ID" \ + -H "Content-Type: application/json" "$@" +} + +# ─── Scoped teardown ─────────────────────────────────────────────────── +# Deletes ONLY the org this run created (DELETE /cp/admin/tenants/$SLUG +# with the {"confirm":$SLUG} fat-finger guard). Never a cluster-wide +# sweep — honors feedback_cleanup_after_each_test and +# feedback_never_run_cluster_cleanup_tests_on_live_platform. The +# workflow's always() step + sweep-stale-e2e-orgs are the outer nets. +teardown() { + local rc=$? 
+ set +e + if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then + echo "" + log "[teardown] E2E_KEEP_ORG=1 — leaving $SLUG for debugging (REMEMBER TO DELETE)" + exit $rc + fi + echo "" + log "[teardown] DELETE /cp/admin/tenants/$SLUG (scoped to this run only)" + admin_call DELETE "/cp/admin/tenants/$SLUG" --max-time 120 \ + -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 + for j in $(seq 1 24); do + LIST=$(admin_call GET "/cp/admin/orgs?limit=500" 2>/dev/null) + LEAK=$(echo "$LIST" | python3 -c " +import sys, json +try: d = json.load(sys.stdin) +except Exception: print(1); sys.exit(0) +orgs = d if isinstance(d, list) else d.get('orgs', []) +print(sum(1 for o in orgs if o.get('slug') == '$SLUG' and o.get('instance_status') not in ('purged',) and o.get('status') != 'purged')) +" 2>/dev/null || echo 1) + if [ "$LEAK" = "0" ]; then + log "[teardown] ✓ $SLUG purged (after ${j}x5s)" + exit $rc + fi + sleep 5 + done + echo "::warning::[teardown] $SLUG still present after 120s — sweep-stale-e2e-orgs will catch it within MAX_AGE_MINUTES" >&2 + [ $rc -eq 0 ] && rc=4 + exit $rc +} +trap teardown EXIT INT TERM + +# ─── 1. Provision the throwaway org ──────────────────────────────────── +log "1/6 POST /cp/admin/orgs — slug=$SLUG" +CREATE=$(admin_call POST /cp/admin/orgs \ + -d "{\"slug\":\"$SLUG\",\"name\":\"E2E peer-visibility $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}") +ORG_ID=$(echo "$CREATE" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null) +[ -n "$ORG_ID" ] || fail "org creation failed: $(echo "$CREATE" | head -c 300)" +log " ORG_ID=$ORG_ID" + +# ─── 2. Wait for tenant EC2 + DNS ────────────────────────────────────── +log "2/6 waiting for tenant instance_status=running (cold EC2 + cloudflared)..." +DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS )) +while true; do + [ "$(date +%s)" -gt "$DEADLINE" ] && fail "tenant never came up within ${PROVISION_TIMEOUT_SECS}s" + STATUS=$(admin_call GET "/cp/admin/orgs?limit=500" 2>/dev/null | python3 -c " +import sys, json +try: d = json.load(sys.stdin) +except Exception: sys.exit(0) +orgs = d if isinstance(d, list) else d.get('orgs', []) +for o in orgs: + if o.get('slug') == '$SLUG': + print(o.get('instance_status') or o.get('status') or 'unknown'); break +" 2>/dev/null) + case "$STATUS" in running|online|ready) break ;; esac + sleep 10 +done +log " tenant status=$STATUS" + +# ─── 3. Per-tenant admin token + tenant URL ──────────────────────────── +log "3/6 fetching per-tenant admin token..." +TT_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token") +TENANT_TOKEN=$(echo "$TT_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null) +[ -n "$TENANT_TOKEN" ] || fail "tenant token fetch failed: $(echo "$TT_RESP" | head -c 200)" + +CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##') +case "$CP_HOST" in + api.*) DERIVED_DOMAIN="${CP_HOST#api.}" ;; + staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;; + *) DERIVED_DOMAIN="$CP_HOST" ;; +esac +TENANT_URL="https://${SLUG}.${DERIVED_DOMAIN}" +log " tenant url: $TENANT_URL" + +log "3b. waiting for tenant /health (TLS/DNS, up to 10min)..." +for i in $(seq 1 120); do + curl -fsS "$TENANT_URL/health" -m 5 -k >/dev/null 2>&1 && { log " /health ok (attempt $i)"; break; } + sleep 5 +done + +# ─── 4. Provision the parent + one sibling per runtime under test ────── +# Inject the LLM provider key so each runtime can authenticate at boot. 
+# Priority: MiniMax → direct-Anthropic → OpenAI (mirrors +# test_staging_full_saas.sh's secrets-injection chain). +SECRETS_JSON='{}' +if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then + SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_MINIMAX_API_KEY'];print(json.dumps({'ANTHROPIC_BASE_URL':'https://api.minimax.io/anthropic','ANTHROPIC_AUTH_TOKEN':k,'MINIMAX_API_KEY':k}))") +elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then + SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_ANTHROPIC_API_KEY'];print(json.dumps({'ANTHROPIC_API_KEY':k}))") +elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then + SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_OPENAI_API_KEY'];print(json.dumps({'OPENAI_API_KEY':k,'OPENAI_BASE_URL':'https://api.openai.com/v1','MODEL_PROVIDER':'openai:gpt-4o','HERMES_INFERENCE_PROVIDER':'custom','HERMES_CUSTOM_BASE_URL':'https://api.openai.com/v1','HERMES_CUSTOM_API_KEY':k,'HERMES_CUSTOM_API_MODE':'chat_completions'}))") +fi + +log "4/6 provisioning parent (claude-code) + one sibling per runtime under test..." +P_RESP=$(tenant_call POST /workspaces \ + -d "{\"name\":\"pv-parent\",\"runtime\":\"claude-code\",\"tier\":3,\"secrets\":$SECRETS_JSON}") +PARENT_ID=$(echo "$P_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null) +[ -n "$PARENT_ID" ] || fail "parent create failed: $(echo "$P_RESP" | head -c 300)" +log " PARENT_ID=$PARENT_ID" + +# WS_IDS[runtime]=id ; WS_TOKENS[runtime]=auth_token (the MCP bearer) +declare -A WS_IDS WS_TOKENS +ALL_WS_IDS="$PARENT_ID" +for rt in $PV_RUNTIMES; do + R=$(tenant_call POST /workspaces \ + -d "{\"name\":\"pv-$rt\",\"runtime\":\"$rt\",\"tier\":2,\"parent_id\":\"$PARENT_ID\",\"secrets\":$SECRETS_JSON}") + WID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null) + # auth_token is top-level for container runtimes; external-like nest it + # under connection.auth_token (verified vs staging response shape). + WTOK=$(echo "$R" | python3 -c " +import sys, json +try: d = json.load(sys.stdin) +except Exception: print(''); sys.exit(0) +print(d.get('auth_token') or d.get('connection', {}).get('auth_token') or '') +" 2>/dev/null) + [ -n "$WID" ] || fail "$rt workspace create failed: $(echo "$R" | head -c 300)" + [ -n "$WTOK" ] || fail "$rt workspace did not return an auth_token — cannot drive its MCP call (resp: $(echo "$R" | head -c 300))" + WS_IDS[$rt]="$WID" + WS_TOKENS[$rt]="$WTOK" + ALL_WS_IDS="$ALL_WS_IDS $WID" + log " $rt → $WID" +done + +# ─── 5. Wait for every sibling online ────────────────────────────────── +log "5/6 waiting for all workspaces status=online (up to ${PROVISION_TIMEOUT_SECS}s — cold boot)..." +WS_DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS )) +for rt in $PV_RUNTIMES; do + wid="${WS_IDS[$rt]}" + LAST="" + while true; do + [ "$(date +%s)" -gt "$WS_DEADLINE" ] && fail "$rt ($wid) never reached online (last=$LAST)" + S=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | python3 -c " +import sys, json +try: d = json.load(sys.stdin) +except Exception: sys.exit(0) +w = d.get('workspace') if isinstance(d.get('workspace'), dict) else d +print(w.get('status') or '') +" 2>/dev/null) + [ "$S" != "$LAST" ] && { log " $rt → $S"; LAST="$S"; } + case "$S" in + online) break ;; + failed) sleep 10 ;; # transient: bootstrap-watcher 5-min deadline, heartbeat recovers + *) sleep 10 ;; + esac + done + ok " $rt online" +done + +# ─── 6. THE GATE — literal mcp_molecule_list_peers via POST /:id/mcp ──── +# This is the byte-for-byte user-facing call. 
NOT GET /registry/:id/peers, +# NOT /health, NOT the heartbeat table. JSON-RPC 2.0 tools/call, +# name=list_peers, authenticated by the workspace's OWN bearer token +# through WorkspaceAuth + MCPRateLimiter. +log "6/6 driving the LITERAL list_peers MCP call per runtime..." +echo "" +RPC_BODY='{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"list_peers","arguments":{}}}' +REGRESSED=0 +declare -A VERDICT + +for rt in $PV_RUNTIMES; do + wid="${WS_IDS[$rt]}" + wtok="${WS_TOKENS[$rt]}" + # The expected peer set = every OTHER provisioned workspace (parent + + # the sibling runtimes), excluding the caller itself. + EXPECT_IDS=$(echo "$ALL_WS_IDS" | tr ' ' '\n' | grep -v "^${wid}$" | grep -v '^$') + + set +e + RESP=$(curl -sS -X POST "$TENANT_URL/workspaces/$wid/mcp" \ + -H "Authorization: Bearer $wtok" \ + -H "X-Molecule-Org-Id: $ORG_ID" \ + -H "Content-Type: application/json" \ + -d "$RPC_BODY" \ + -o /tmp/pv_mcp_body.json -w "%{http_code}" 2>/dev/null) + set -e + HTTP_CODE="$RESP" + BODY=$(cat /tmp/pv_mcp_body.json 2>/dev/null || echo '') + + echo "--- $rt (ws=$wid) ---" + echo " HTTP $HTTP_CODE" + echo " body: $(echo "$BODY" | head -c 600)" + + # (1) HTTP 200 — a 401 (WorkspaceAuth reject, the Hermes symptom) fails here. + if [ "$HTTP_CODE" != "200" ]; then + echo " ✗ $rt: list_peers MCP call returned HTTP $HTTP_CODE (expected 200)" + VERDICT[$rt]="FAIL(http=$HTTP_CODE)" + REGRESSED=1 + continue + fi + + # (2) JSON-RPC result present, not an error object. + PARSE=$(echo "$BODY" | python3 -c " +import sys, json +expect = set(filter(None, '''$EXPECT_IDS'''.split())) +try: + d = json.load(sys.stdin) +except Exception as e: + print('PARSE_ERROR:' + str(e)); sys.exit(0) +if isinstance(d, dict) and d.get('error') is not None: + print('RPC_ERROR:' + json.dumps(d['error'])[:200]); sys.exit(0) +res = d.get('result') if isinstance(d, dict) else None +if res is None: + print('NO_RESULT'); sys.exit(0) +# MCP tools/call result shape: {content:[{type:text,text:''}]} +text = '' +if isinstance(res, dict): + for c in res.get('content', []): + if c.get('type') == 'text': + text += c.get('text', '') +text_l = text.lower() +# Native-sessions fallback signature (the OpenClaw symptom): the agent +# answered from its own runtime session list, not the platform peer set. +if 'sessions_list' in text_l or 'no platform peers' in text_l or 'native session' in text_l: + print('NATIVE_FALLBACK:' + text[:200]); sys.exit(0) +# The expected sibling IDs must literally appear in the returned peer text. 
+found = sorted(i for i in expect if i in text) +missing = sorted(expect - set(found)) +if not expect: + print('NO_EXPECTED_PEERS_CONFIGURED'); sys.exit(0) +if missing: + print('MISSING_PEERS:found=%d/%d missing=%s' % (len(found), len(expect), ','.join(m[:8] for m in missing))) + sys.exit(0) +print('OK:found=%d/%d' % (len(found), len(expect))) +" 2>/dev/null) + + case "$PARSE" in + OK:*) + echo " ✓ $rt: list_peers returned 200 and contains all expected peers ($PARSE)" + VERDICT[$rt]="OK" + ;; + NATIVE_FALLBACK:*) + echo " ✗ $rt: list_peers fell back to NATIVE sessions — sees no platform peers ($PARSE)" + VERDICT[$rt]="FAIL(native-fallback)" + REGRESSED=1 + ;; + RPC_ERROR:*|NO_RESULT|PARSE_ERROR:*) + echo " ✗ $rt: list_peers MCP call did not return a usable result ($PARSE)" + VERDICT[$rt]="FAIL(rpc=$PARSE)" + REGRESSED=1 + ;; + MISSING_PEERS:*) + echo " ✗ $rt: list_peers returned 200 but peer set is wrong/empty ($PARSE)" + VERDICT[$rt]="FAIL(peers=$PARSE)" + REGRESSED=1 + ;; + *) + echo " ✗ $rt: unexpected verdict '$PARSE'" + VERDICT[$rt]="FAIL(unknown)" + REGRESSED=1 + ;; + esac + echo "" +done + +echo "=== SUMMARY — fresh-provision peer-visibility (literal MCP list_peers) ===" +for rt in $PV_RUNTIMES; do + printf ' %-14s %s\n' "$rt" "${VERDICT[$rt]:-NO_RUN}" +done +echo "" + +if [ "$REGRESSED" -ne 0 ]; then + echo "✗ GATE FAILED — at least one runtime cannot see its peers via the" + echo " literal mcp_molecule_list_peers call. This is the real user-facing" + echo " failure the proxy signals (registry row / heartbeat / model 200)" + echo " were hiding. Expected RED until the Hermes-401 + OpenClaw-MCP-wiring" + echo " root-cause fixes land; goes green only when they actually do." + exit 10 +fi + +ok "GATE PASSED — every runtime under test sees its platform peers via the literal MCP call." +exit 0
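+
+# Local usage sketch (uses only the env vars documented in the header;
+# E2E_KEEP_ORG=1 skips teardown for debugging, so the org must then be
+# deleted manually):
+#   MOLECULE_ADMIN_TOKEN=<staging CP_ADMIN_API_TOKEN> \
+#   PV_RUNTIMES="hermes" E2E_KEEP_ORG=1 \
+#     bash tests/e2e/test_peer_visibility_mcp_staging.sh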