# molecule-core/.gitea/workflows/e2e-peer-visibility.yml

name: 'E2E Peer Visibility (literal MCP list_peers)'

# WHY A DEDICATED WORKFLOW (not folded into e2e-staging-saas.yml)
# --------------------------------------------------------------
# This is the systemic fix for a real trust failure. Hermes and OpenClaw
# were reported "fleet-verified / cascade-complete" because the *proxy*
# signals were green (registry registration + heartbeat for Hermes; model
# round-trip 200 for OpenClaw). A freshly-provisioned workspace asked on
# canvas "can you see your peers" actually FAILS:
#   - Hermes: 401 on the molecule MCP `list_peers` call
#   - OpenClaw: native `sessions_list` fallback, sees no platform peers
# Tasks #142/#159 were even marked "completed" under this proxy flaw.
#
# A dedicated workflow (vs extending e2e-staging-saas.yml) because:
#   - It must provision MULTIPLE distinct runtimes (hermes, openclaw,
#     claude-code) in ONE org and assert each sees the others. The
#     full-saas script is single-runtime-per-run (E2E_RUNTIME) and folding
#     a multi-runtime matrix into it would conflate concerns and bloat its
#     already-45-min run.
#   - It needs its own concurrency group so it doesn't fight full-saas /
#     canvas for the staging org-creation quota.
#   - It needs an independent, non-required status-context name so it can
#     be RED today (the in-flight Hermes-401 / OpenClaw-MCP-wiring fixes
#     have not landed) WITHOUT wedging unrelated merges — and flipped to
#     REQUIRED in one branch-protection edit once it goes green
#     (flip-to-required checklist: molecule-core#1296).
#
# THE ASSERTION IS NOT A PROXY. The driving script
# tests/e2e/test_peer_visibility_mcp_staging.sh issues the byte-for-byte
# JSON-RPC `tools/call name=list_peers` envelope to `POST
# /workspaces/:id/mcp` using each workspace's OWN bearer token, through
# the real WorkspaceAuth + MCPRateLimiter middleware chain — the exact
# call mcp_molecule_list_peers makes from a canvas agent. It does NOT
# read a registry row, /health, the heartbeat table, or
# GET /registry/:id/peers.
#
# HONEST GATE — NO continue-on-error. Per feedback_fix_root_not_symptom a
# fake-green mask would defeat the entire purpose. This workflow goes red
# on today's broken behavior and green only when the root-cause fixes
# actually land. It is intentionally NOT in branch_protections — see PR
# body for the required-vs-not decision + flip tracking issue.
#
# Gitea 1.22.6 / act_runner notes honored:
#   - No cross-repo `uses:` (feedback_gitea_cross_repo_uses_blocked). The
#     actions/checkout SHA is the one e2e-staging-canvas.yml already uses
#     successfully (a mirrored SHA — see #1277/PR#1292 root-cause).
#   - Per-SHA concurrency, not global (feedback_concurrency_group_per_sha).
#   - Workflow-level GITHUB_SERVER_URL pinned
#     (feedback_act_runner_github_server_url).
#   - pr-validate posts a status under the same check name so a
#     workflow-only PR is not silently statusless and the context is
#     flip-to-required-ready (mirrors e2e-staging-saas.yml's proven shape;
#     real EC2-provisioning E2E is push/dispatch/cron only — it is 30+ min
#     and cannot run per-PR-update).
#
# LOCAL BACKEND (added 2026-05-15 — feedback_local_must_mimic_production,
# feedback_mandatory_local_e2e_before_ship,
# feedback_local_test_before_staging_e2e)
# --------------------------------------------------------------------
# The standing rule is that the local prod-mimic stack runs a MANDATORY
# local-Postgres E2E BEFORE staging E2E. A staging-only peer-visibility
# gate caught regressions late + expensively (cold EC2). The
# `peer-visibility-local` job below runs the SAME byte-identical
# assertion (tests/e2e/lib/peer_visibility_assert.sh) against the local
# docker-compose stack — built + booted exactly like e2e-api.yml's
# proven E2E API Smoke Test job (ephemeral pg/redis ports, go build,
# background platform-server). It runs on PR + push (local boot is
# minutes, not the 30+ min cold-EC2 path), so peer-visibility is part of
# the local gate that fires before the staging E2E.
#
# It is its OWN non-required status context `E2E Peer Visibility (local)`
# — same non-required-by-design decision as the staging job (red until
# Hermes-401 #162 / OpenClaw-never-online #165 land; flip-to-required
# tracked at molecule-core#1296). It is an HONEST gate: NO
# continue-on-error mask (feedback_fix_root_not_symptom). It is kept a
# distinct context (not folded into e2e-api.yml's required `E2E API
# Smoke Test`) precisely so a deliberately-RED-today gate cannot wedge
# the required local-E2E job or any unrelated merge.

on:
  # Pushes to main that touch the MCP handlers, the middleware chain, the
  # registry/workspace handlers, the peer-visibility scripts, or this
  # workflow file itself.
  push:
    branches:
      - main
    paths:
      - 'workspace-server/internal/handlers/mcp.go'
      - 'workspace-server/internal/handlers/mcp_tools.go'
      - 'workspace-server/internal/middleware/**'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace.go'
      - 'tests/e2e/test_peer_visibility_mcp_staging.sh'
      - 'tests/e2e/test_peer_visibility_mcp_local.sh'
      - 'tests/e2e/lib/peer_visibility_assert.sh'
      - '.gitea/workflows/e2e-peer-visibility.yml'
  # Pull requests targeting main: same path filter as the push trigger.
  pull_request:
    branches:
      - main
    paths:
      - 'workspace-server/internal/handlers/mcp.go'
      - 'workspace-server/internal/handlers/mcp_tools.go'
      - 'workspace-server/internal/middleware/**'
      - 'workspace-server/internal/handlers/registry.go'
      - 'workspace-server/internal/handlers/workspace.go'
      - 'tests/e2e/test_peer_visibility_mcp_staging.sh'
      - 'tests/e2e/test_peer_visibility_mcp_local.sh'
      - 'tests/e2e/lib/peer_visibility_assert.sh'
      - '.gitea/workflows/e2e-peer-visibility.yml'
  workflow_dispatch:
  schedule:
    # Daily at 07:30 UTC, so AMI / template-hermes / template-openclaw
    # drift is caught even on days with no pushes. Deliberately 30 minutes
    # after e2e-staging-saas (07:00) so the two runs don't collide on the
    # staging org-creation quota.
    - cron: '30 7 * * *'

concurrency:
  # One group per commit SHA (feedback_concurrency_group_per_sha): the PR
  # head SHA when the event carries one, otherwise the pushed SHA. With a
  # single global group, a staging/main push queued behind a PR run could
  # be cancelled, leaving any gate that reads "completed run at SHA" stuck.
  cancel-in-progress: false
  group: "e2e-peer-visibility-${{ github.event.pull_request.head.sha || github.sha }}"

env:
  # Pinned at workflow level for act_runner
  # (feedback_act_runner_github_server_url).
  GITHUB_SERVER_URL: 'https://git.moleculesai.app'

jobs:
  # PR path: publish a real status under the required-ready check name so
  # a workflow-only PR is never left without a status. The actual EC2 E2E
  # is push/dispatch/cron only (30+ min). This job is NOT a fake-green
  # mask of the real assertion: it runs `bash -n` over the two driving
  # scripts and the shared assertion lib, so a script with a shell syntax
  # error fails at PR time. (`bash -n` only parses shell; any Python
  # embedded in those scripts is not checked here.)
  pr-validate:
    name: E2E Peer Visibility
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Validate driving scripts + shared assertion lib
        run: |
          for script in \
            tests/e2e/lib/peer_visibility_assert.sh \
            tests/e2e/test_peer_visibility_mcp_staging.sh \
            tests/e2e/test_peer_visibility_mcp_local.sh
          do
            bash -n "$script"
            echo "${script#tests/e2e/} — bash syntax OK"
          done
          echo "Staging fresh-provision MCP list_peers E2E runs on push to"
          echo "main / workflow_dispatch / daily cron (30+ min EC2 boot)."
          echo "The LOCAL backend runs in the peer-visibility-local job"
          echo "below on this same PR (local docker-compose stack)."

  # LOCAL gate. Runs the same byte-identical assertion as the staging job,
  # but against the local prod-mimic docker-compose stack — the MANDATORY
  # local E2E that has to run BEFORE the staging E2E
  # (feedback_mandatory_local_e2e_before_ship,
  # feedback_local_test_before_staging_e2e).
  #   - Bootstrap mirrors e2e-api.yml's proven E2E API Smoke Test job:
  #     per-run container names + ephemeral host ports (so concurrent
  #     host-network act_runner runs don't collide), go build, background
  #     platform-server.
  #   - Own non-required status context `E2E Peer Visibility (local)`,
  #     non-required-by-design exactly like the staging job (red until
  #     #162/#165 land; flip-to-required tracked at molecule-core#1296).
  #   - HONEST gate: NO continue-on-error mask
  #     (feedback_fix_root_not_symptom).
  #   - No `if:` filter, so it runs on every trigger of this workflow,
  #     including PR + push (local boot is minutes, not the 30+ min
  #     cold-EC2 path).
  # bp-required: pending #1296
  peer-visibility-local:
    name: E2E Peer Visibility (local)
    runs-on: docker-host
    timeout-minutes: 30
    env:
      # Container names are unique per run id + attempt — the same
      # collision-avoidance scheme as e2e-api.yml (host-network
      # act_runner; feedback_act_runner_*). Host ports are chosen
      # ephemerally by the start steps.
      PG_CONTAINER: "pg-e2e-pv-${{ github.run_id }}-${{ github.run_attempt }}"
      REDIS_CONTAINER: "redis-e2e-pv-${{ github.run_id }}-${{ github.run_attempt }}"
      # Space-separated list of runtimes handed to the driving script.
      PV_RUNTIMES: 'hermes openclaw claude-code'
      # LLM credentials so hermes/openclaw can actually boot. The local
      # script SKIPs (does not fail) a runtime whose key is absent, so a
      # partially keyed CI environment still exercises whatever it can.
      CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.E2E_CLAUDE_CODE_OAUTH_TOKEN }}
      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      # Go toolchain for building workspace-server; the module cache is
      # keyed on workspace-server/go.sum.
      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version: stable
          cache: true
          cache-dependency-path: workspace-server/go.sum
      - name: Pre-pull alpine + ensure provisioner network
        run: |
          # Pull quietly; a pull failure still fails the step.
          docker pull alpine:latest > /dev/null
          # Creating the network is best-effort: an error (e.g. it already
          # exists) is deliberately ignored.
          docker network create molecule-core-net > /dev/null 2>&1 || :
          echo "alpine:latest pre-pulled; molecule-core-net ensured."
      # Starts a throwaway postgres:16 container on a Docker-assigned host
      # port, exports DATABASE_URL via $GITHUB_ENV for later steps, and
      # waits up to 30s for the server to accept connections.
      - name: Start Postgres (docker, ephemeral port)
        run: |
          # Drop any leftover container with this run's name (normally none).
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
          # `-p 0:5432` lets Docker pick a free host port.
          docker run -d --name "$PG_CONTAINER" \
            -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
            -p 0:5432 postgres:16 >/dev/null
          # Prefer the IPv4 mapping; fall back to the last colon-separated
          # field of the first mapping line if no 0.0.0.0 entry is listed.
          PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
          [ -n "$PG_PORT" ] || PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
          if [ -z "$PG_PORT" ]; then
            echo "::error::Could not resolve host port for $PG_CONTAINER"
            docker logs "$PG_CONTAINER" || true; exit 1
          fi
          echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
          for i in $(seq 1 30); do
            # Probe over TCP (-h 127.0.0.1), not the default unix socket.
            # During first-boot initialisation the postgres image runs a
            # temporary server that only listens on the unix socket and is
            # restarted afterwards; a socket probe can report "ready" in
            # that window even though the real server (the one clients
            # reach through the published port) is not up yet.
            docker exec "$PG_CONTAINER" pg_isready -h 127.0.0.1 -p 5432 -U dev >/dev/null 2>&1 && { echo "Postgres ready after ${i}s"; exit 0; }
            sleep 1
          done
          echo "::error::Postgres did not become ready in 30s"; docker logs "$PG_CONTAINER" || true; exit 1
      # Starts a throwaway redis:7 container on a Docker-assigned host
      # port, exports REDIS_URL via $GITHUB_ENV, and waits up to 15s for
      # a PONG.
      - name: Start Redis (docker, ephemeral port)
        run: |
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
          docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
          # Prefer the IPv4 mapping; otherwise take the last
          # colon-separated field of the first mapping line.
          host_port=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
          if [ -z "$host_port" ]; then
            host_port=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
          fi
          if [ -z "$host_port" ]; then
            echo "::error::Could not resolve host port for $REDIS_CONTAINER"
            docker logs "$REDIS_CONTAINER" || true
            exit 1
          fi
          echo "REDIS_URL=redis://127.0.0.1:${host_port}" >> "$GITHUB_ENV"
          attempt=1
          while [ "$attempt" -le 15 ]; do
            if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
              echo "Redis ready after ${attempt}s"
              exit 0
            fi
            sleep 1
            attempt=$((attempt + 1))
          done
          echo "::error::Redis did not become ready in 15s"
          docker logs "$REDIS_CONTAINER" || true
          exit 1
      # Compiles the server binary to workspace-server/platform-server.
      - name: Build platform
        working-directory: workspace-server
        run: go build -o platform-server ./cmd/server
      # Asks the kernel for a free loopback TCP port (bind to port 0, read
      # the assigned port back, release the socket) and exports it as PORT
      # and BASE for the following steps.
      - name: Pick platform port
        run: |
          PLATFORM_PORT=$(python3 -c 'import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(("127.0.0.1", 0)); print(s.getsockname()[1]); s.close()')
          {
            echo "PORT=${PLATFORM_PORT}"
            echo "BASE=http://127.0.0.1:${PLATFORM_PORT}"
          } >> "$GITHUB_ENV"
          echo "Platform host port: ${PLATFORM_PORT}"
      # Terminates leftover platform-server processes before a new one is
      # started. Candidates are found by scanning /proc/<pid>/comm for
      # "platform-serve", then confirmed by requiring "platform-server" in
      # the process's command line. Each match gets a plain `kill` (the
      # default signal); if anything was killed the step sleeps 2s before
      # continuing. The step never fails: every kill is `|| true`.
      # NOTE(review): the scan is host-wide and not scoped to this run. If
      # the runner executes several jobs at once in the same PID namespace,
      # this could also terminate a platform-server that belongs to a
      # concurrently running job — confirm against the runner's
      # concurrency setup.
      - name: Kill stale platform-server before start
        run: |
          killed=0
          for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
            kpid="${pid%/comm}"; kpid="${kpid##*/}"
            cmdline=$(cat "/proc/${kpid}/cmdline" 2>/dev/null | tr '\0' ' ')
            if echo "$cmdline" | grep -q "platform-server"; then
              echo "Killing stale platform-server pid ${kpid}"
              kill "$kpid" 2>/dev/null || true; killed=$((killed + 1))
            fi
          done
          [ "$killed" -gt 0 ] && sleep 2 || true
          echo "stale-kill done ($killed killed)"
      # Launches the freshly built server in the background from
      # workspace-server/, sending stdout+stderr to platform.log and
      # recording the background pid in platform.pid for the later
      # "Stop platform" step.
      - name: Start platform (background)
        working-directory: workspace-server
        run: |
          ./platform-server >platform.log 2>&1 &
          printf '%s\n' "$!" >platform.pid
      # Polls $BASE/health once per second for up to 30s. Succeeds on the
      # first successful response; otherwise dumps platform.log and fails.
      - name: Wait for /health
        run: |
          pidfile=workspace-server/platform.pid
          for i in $(seq 1 30); do
            # --max-time bounds each probe so a server that accepts the
            # connection but never answers cannot stall the loop.
            if curl -sf --max-time 2 "$BASE/health" > /dev/null; then
              echo "Platform up after ${i}s"; exit 0
            fi
            # Fail fast when the recorded server process is already gone
            # (e.g. it crashed at boot) instead of polling a dead server
            # for the remaining attempts.
            if [ -f "$pidfile" ] && ! kill -0 "$(cat "$pidfile")" 2>/dev/null; then
              echo "::error::platform-server exited before becoming healthy"
              cat workspace-server/platform.log || true; exit 1
            fi
            sleep 1
          done
          echo "::error::Platform did not become healthy in 30s"
          cat workspace-server/platform.log || true; exit 1
      - name: Run LOCAL fresh-provision peer-visibility E2E (literal MCP list_peers)
        # HONEST gate: there is NO continue-on-error here. Expected to be
        # red today (Hermes-401 #162 / OpenClaw-never-online #165 are not
        # fixed yet) and green once they land. It stays non-required via
        # its distinct status context until the molecule-core#1296
        # flip-to-required.
        run: bash tests/e2e/test_peer_visibility_mcp_local.sh
      # Only on failure: surface the server log in the job output.
      - name: Dump platform log on failure
        if: failure()
        run: cat workspace-server/platform.log || true
      # Always: stop the background server using the recorded pid.
      # Best-effort — a missing pid file or an already-dead process is fine.
      - name: Stop platform
        if: always()
        run: |
          pidfile=workspace-server/platform.pid
          [ ! -f "$pidfile" ] || kill "$(cat "$pidfile")" 2>/dev/null || true
      # Always: force-remove this run's Postgres and Redis containers.
      # Best-effort — a container that was never created is not an error.
      - name: Stop service containers
        if: always()
        run: |
          # -v also removes the anonymous data volumes that the postgres
          # and redis images declare; without it every run would leave
          # orphaned volumes behind on the runner host.
          docker rm -f -v "$PG_CONTAINER" 2>/dev/null || true
          docker rm -f -v "$REDIS_CONTAINER" 2>/dev/null || true

  # Real STAGING gate. Provisions a throwaway org with one sibling
  # workspace per runtime, drives the LITERAL list_peers MCP call for each
  # runtime, asserts 200 + the expected peer set, then performs a scoped
  # teardown. Skipped on pull_request events, i.e. it runs for
  # push(main) / workflow_dispatch / cron only.
  peer-visibility:
    name: E2E Peer Visibility
    if: github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    timeout-minutes: 60

    env:
      # Staging control plane and its admin credential.
      MOLECULE_CP_URL: 'https://staging-api.moleculesai.app'
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      # Unique per run id + attempt.
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      # Space-separated list of runtimes handed to the driving script.
      PV_RUNTIMES: 'hermes openclaw claude-code'
      # LLM provider keys so each runtime can authenticate at boot. The
      # priority MiniMax → direct-Anthropic → OpenAI matches
      # test_staging_full_saas.sh's secrets-injection chain.
      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # Fail early (exit 2) when the staging admin token secret is empty.
      - name: Verify admin token present
        run: |
          if [ -n "$MOLECULE_ADMIN_TOKEN" ]; then
            echo "Admin token present"
          else
            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
            exit 2
          fi

      # Fail early (exit 2) unless at least one of the three provider keys
      # is set.
      - name: Verify an LLM key present
        run: |
          if [ -n "${E2E_MINIMAX_API_KEY:-}" ] || [ -n "${E2E_ANTHROPIC_API_KEY:-}" ] || [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
            echo "LLM key present"
          else
            echo "::error::No LLM provider key set — workspaces fail at boot with 'No provider API key found'. Set MOLECULE_STAGING_MINIMAX_API_KEY (or ANTHROPIC / OPENAI)."
            exit 2
          fi

      # Preflight: require HTTP 200 from the staging control plane's
      # /health before spending time on provisioning. Any other outcome is
      # reported as an infra problem and fails the job.
      - name: CP staging health preflight
        run: |
          # A transport-level failure (DNS, refused connection, timeout)
          # makes curl exit non-zero while still printing 000 through -w.
          # `|| true` keeps an errexit shell from aborting right here, so
          # the explanatory ::error:: annotation below is always emitted.
          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health") || true
          [ -n "$code" ] || code="000"
          if [ "$code" != "200" ]; then
            echo "::error::Staging CP unhealthy (HTTP $code) — infra, not a workspace bug. Failing loud per feedback_fix_root_not_symptom."
            exit 1
          fi
          echo "Staging CP healthy"

      # The real assertion: the staging driving script.
      - name: Run fresh-provision peer-visibility E2E (literal MCP list_peers)
        run: bash tests/e2e/test_peer_visibility_mcp_staging.sh

      # Belt-and-braces scoped teardown: the script installs an EXIT/INT/
      # TERM trap, but if the runner itself is cancelled the trap may not
      # fire. This always() step deletes ONLY the e2e-pv-<run_id> org this
      # run created — never a cluster-wide sweep
      # (feedback_never_run_cluster_cleanup_tests_on_live_platform). The
      # admin DELETE is idempotent so double-invoking is safe;
      # sweep-stale-e2e-orgs is the final net (slug starts with 'e2e-').
      #
      # Flow: list orgs from the admin API, keep the slugs that start with
      # e2e-pv-<YYYYMMDD>-<run_id>- (today or yesterday, UTC) and are not
      # already purged, then DELETE each one. The step is best-effort and
      # always exits 0; failed deletes are surfaced as warnings.
      - name: Teardown safety net (runs on cancel/failure)
        if: always()
        env:
          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
        run: |
          # Best-effort cleanup: no individual failure may abort the step.
          set +e
          # Without a run id the slug prefix cannot be scoped to this run.
          # Falling back to a date-only prefix would delete orgs owned by
          # OTHER runs, so skip instead and leave leftovers to the sweeper.
          if [ -z "${GITHUB_RUN_ID:-}" ]; then
            echo "::warning::GITHUB_RUN_ID is empty — skipping safety-net teardown instead of sweeping other runs' e2e-pv orgs; sweep-stale-e2e-orgs will catch leftovers."
            exit 0
          fi
          # --max-time keeps a hung admin API from stalling the teardown.
          orgs=$(curl -sS --max-time 30 "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
            | python3 -c "
          import json, sys, os, datetime
          run_id = os.environ.get('GITHUB_RUN_ID', '')
          if not run_id:
              # Never widen the sweep beyond this run.
              sys.exit(0)
          try:
              d = json.load(sys.stdin)
          except Exception:
              # Unparseable listing (error page, empty body): nothing to do.
              sys.exit(0)
          # ONLY sweep slugs from THIS run. e2e-pv-<YYYYMMDD>-<run_id>-...
          # Match today AND yesterday in UTC so a midnight-crossing run
          # still matches its own slug (same bug class as the saas/canvas
          # safety nets). The date is taken in UTC explicitly rather than
          # in the runner's local time zone.
          today = datetime.datetime.now(datetime.timezone.utc).date()
          yest = today - datetime.timedelta(days=1)
          prefixes = tuple(f'e2e-pv-{dt:%Y%m%d}-{run_id}-' for dt in (today, yest))
          # The listing is either a bare list or an object with an orgs key.
          if isinstance(d, dict):
              d = d.get('orgs') or []
          if not isinstance(d, list):
              d = []
          cands = []
          for o in d:
              if not isinstance(o, dict):
                  continue
              slug = o.get('slug')
              # Tolerate null / non-string slugs instead of crashing.
              if not isinstance(slug, str):
                  continue
              if slug.startswith(prefixes) and o.get('instance_status') != 'purged':
                  cands.append(slug)
          print('\n'.join(cands))
          " 2>/dev/null)
          for slug in $orgs; do
            echo "Safety-net teardown: $slug"
            # Response body goes to a scratch file; the status code is
            # captured from -w. An empty code (curl produced no output)
            # is reported as 000.
            code=$(curl -sS -o /tmp/pv-cleanup.out -w "%{http_code}" \
              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
              -H "Authorization: Bearer $ADMIN_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"confirm\":\"$slug\"}")
            [ -n "$code" ] || code="000"
            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
              echo "[teardown] deleted $slug (HTTP $code)"
            else
              echo "::warning::pv teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within MAX_AGE_MINUTES. Body: $(head -c 300 /tmp/pv-cleanup.out 2>/dev/null)"
            fi
          done
          exit 0