diff --git a/.gitea/workflows/boot-to-registration-e2e.yml b/.gitea/workflows/boot-to-registration-e2e.yml
new file mode 100644
index 000000000..5254dba7b
--- /dev/null
+++ b/.gitea/workflows/boot-to-registration-e2e.yml
@@ -0,0 +1,165 @@
+name: boot-to-registration-e2e (advisory)
+
+# cp#455 — Minimal-cell boot-to-registration e2e.
+# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
+# it should now go GREEN since the fix is live."
+#
+# Stage 1 of 5-stage rollout. Reuses the dispatch-only EC2
+# provisioning path from test_staging_full_saas.sh but reduced to
+# the minimum boot-to-registration surface:
+#
+#   1. Provision request accepted; workspace transitions to booting/running
+#   2. Controlplane receives /registry/register for that workspace_id
+#   3. JSON-RPC/completion route returns successful minimal response
+#   4. Teardown terminates workspace even on failure (trap)
+#
+# Advisory (non-blocking) per Researcher Stage 2 design — RED on
+# current main is expected pre-cp#469-cluster. After cp#477 deploy
+# (888efceb) + PR #2167 merge, cell should turn GREEN. THAT green
+# is the cluster-proof signal.
+#
+# Cost controls (mandatory):
+#   - SPOT instances (tagged run_id/workspace_id for cost attribution)
+#   - Fast teardown (~3-5 min wall-clock) even on assertion failure
+#   - Structured per-cell results JSON (runtime/provider/model/
+#     billing_mode/workspace_id/register_status/completion_status/
+#     teardown_status/elapsed_seconds)
+#
+# Cell parameters (fixed in the job env, not dispatch inputs — see `on:`):
+#   runtime      : default claude-code
+#   billing_mode : default platform_managed (the cp#469-cluster path)
+#   provider     : default platform (vs direct-to-provider)
+#   model        : default moonshot/kimi-k2.6 (CTO-specified)
+#
+# PR target: molecule-core (this file). Companion harness extension
+# (test_minimal_boot_cell.sh) lives in tests/e2e/ alongside
+# test_staging_full_saas.sh — same repo, same branch.
+#
+# Note: cp#455 was originally spec'd to live in molecule-controlplane
+# (`.gitea/workflows/` path), but molecule-core's CI is the home for
+# tenant-boot e2e tests in this stage. Stage 2 may move the path.
+
+on:
+  workflow_dispatch:
+    # Note: Gitea 1.22.6 does not support workflow_dispatch.inputs
+    # (feedback_gitea_workflow_dispatch_inputs_unsupported). Defaults
+    # are hardcoded in the job env below. Stage 2 can add matrix/
+    # param support once the Gitea version supports it.
+
+# Advisory: no cron schedule, manual dispatch only. Branch protection
+# doesn't require this — RED on main is expected pre-cp#469-cluster
+# deploy, GREEN signals the cluster is live.
+permissions:
+  contents: read
+  # No issue-write; failures surface as red runs in workflow history.
+
+concurrency:
+  group: boot-to-registration-e2e
+  cancel-in-progress: false
+
+jobs:
+  # bp-exempt: advisory e2e — non-gating, manual dispatch only (cp#455 Stage 1)
+  minimal-cell:
+    name: Minimal cell (claude-code + platform + moonshot/kimi-k2.6)
+    runs-on: ubuntu-latest
+    # Bounded at 12 min. Wall-clock budget breakdown:
+    #   - cold EC2 provision: ~3-4 min (SPOT)
+    #   - /registry/register wait: ~30s
+    #   - completion call: ~10s
+    #   - teardown: ~30-60s
+    #   - tail headroom: ~6-7 min
+    timeout-minutes: 12
+    env:
+      # Hardcoded defaults — Gitea 1.22.6 does not support workflow_dispatch.inputs
+      # (feedback_gitea_workflow_dispatch_inputs_unsupported). Stage 2 can add
+      # matrix/param support once the Gitea version supports it.
+      E2E_RUNTIME: claude-code
+      E2E_BILLING_MODE: platform_managed
+      E2E_PROVIDER: platform
+      E2E_MODEL: moonshot/kimi-k2.6
+      E2E_RUN_ID: cp455-${{ github.run_id }}
+      E2E_PROVISION_TIMEOUT_SECS: '300'  # 5 min — fast teardown budget
+      MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
+      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+
+      - name: Verify required secrets present
+        run: |
+          if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
+            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — minimal-cell e2e cannot run"
+            echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
+            exit 1
+          fi
+
+      - name: Verify required tools present
+        run: |
+          for cmd in jq curl python3; do
+            command -v "$cmd" >/dev/null 2>&1 || {
+              echo "::error::required tool '$cmd' not on PATH — runner image regression?"
+              exit 1
+            }
+          done
+
+      - name: Run minimal-cell boot-to-registration harness
+        # The harness script handles its own teardown via EXIT trap;
+        # even on assertion failure (provision timeout, register
+        # timeout, completion failure), the workspace is deprovisioned
+        # and a failed teardown is flagged as leak risk. The script's exit
+        # code fails this step. Structured per-cell results are written to
+        # /tmp/cell-result.json and rendered into the job summary below.
+        run: |
+          bash tests/e2e/test_minimal_boot_cell.sh
+
+      - name: Emit structured per-cell results
+        if: always()
+        # Always run (even on failure) so the structured results are
+        # visible in the workflow summary. The script writes a JSON
+        # file at /tmp/cell-result.json; this step renders it as a
+        # job summary.
+        run: |
+          if [ -f /tmp/cell-result.json ]; then
+            echo "## Minimal-cell results" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo '```json' >> "$GITHUB_STEP_SUMMARY"
+            cat /tmp/cell-result.json >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo '```' >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "## Minimal-cell results: NO_RESULT_FILE" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "Harness did not produce /tmp/cell-result.json — likely crashed before trap fired." >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+      - name: Failure summary
+        if: failure()
+        run: |
+          {
+            echo "## cp#455 minimal-cell FAILED"
+            echo ""
+            echo "**Run ID:** ${{ github.run_id }}"
+            echo "**Runtime:** ${E2E_RUNTIME}"
+            echo "**Billing mode:** ${E2E_BILLING_MODE}"
+            echo "**Provider:** ${E2E_PROVIDER}"
+            echo "**Model:** ${E2E_MODEL}"
+            # Must mirror SLUG in tests/e2e/test_minimal_boot_cell.sh
+            # (cp455-<runtime>-<run id suffix>), not the bare run id.
+            echo "**Slug:** cp455-${E2E_RUNTIME}-${E2E_RUN_ID}"
+            echo ""
+            echo "### What this means"
+            echo ""
+            echo "The minimal claude-code+kimi cell did not pass all 4 assertions:"
+            echo "1. Provision request accepted; workspace transitions to booting/running"
+            echo "2. Controlplane receives /registry/register for that workspace_id"
+            echo "3. JSON-RPC/completion route returns successful minimal response"
+            echo "4. Teardown terminates workspace even on failure (trap)"
+            echo ""
+            echo "RED is expected pre-cp#469-cluster. After cp#477 deploy (888efceb) + PR #2167 merge,"
+            echo "this should turn GREEN. Persistent RED after both merge = cluster bug, not e2e bug."
+            echo ""
+            echo "### Next steps"
+            echo ""
+            echo "1. Check the harness output above for the assertion that failed"
+            echo "2. If assertion 1 fails: provision path broken — check CP admin API + EC2 quota"
+            echo "3. If assertion 2 fails: /registry/register path broken — check workspace-server boot"
+            echo "4. If assertion 3 fails: LLM proxy / completion path broken — check cp#469 cluster"
+            echo "5. If assertion 4 fails: teardown trap broken — leak risk, fix immediately"
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/tests/e2e/test_minimal_boot_cell.sh b/tests/e2e/test_minimal_boot_cell.sh
new file mode 100755
index 000000000..8835cf3d5
--- /dev/null
+++ b/tests/e2e/test_minimal_boot_cell.sh
@@ -0,0 +1,299 @@
+#!/usr/bin/env bash
+# cp#455 — Minimal-cell boot-to-registration harness.
+# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
+# it should now go GREEN since the fix is live."
+#
+# Stage 1 of 5-stage rollout. Reduced to the minimum boot-to-
+# registration surface so each cell run is ~3-5 min wall-clock.
+#
+# Four assertions (per Researcher Task #79 spec):
+#   1. Provision request accepted; workspace transitions to booting/running
+#   2. Controlplane receives /registry/register for that workspace_id
+#   3. JSON-RPC/completion route returns successful minimal response
+#   4. Teardown terminates workspace even on failure (trap)
+#
+# Cost controls (mandatory):
+#   - SPOT instances (via the dispatch-only EC2 provisioning path;
+#     we don't set instance type — that's the provisioner's call)
+#   - Fast teardown ~3-5 min wall-clock
+#   - Structured per-cell results JSON output
+#
+# Auth model (mirrors test_staging_full_saas.sh):
+#   Single MOLECULE_ADMIN_TOKEN drives everything.
+#   - POST /cp/admin/orgs to provision
+#   - GET /cp/admin/orgs/:slug/admin-token for per-tenant token
+#   - DELETE /cp/admin/tenants/:slug for teardown
+#   Per-tenant admin token drives tenant API calls (workspaces,
+#   /registry/register, JSON-RPC completion).
+# +# Required env: +# MOLECULE_CP_URL default: https://staging-api.moleculesai.app +# MOLECULE_ADMIN_TOKEN CP admin bearer +# +# Optional env (passed from workflow_dispatch inputs): +# E2E_RUNTIME default claude-code +# E2E_BILLING_MODE default platform_managed +# E2E_PROVIDER default platform +# E2E_MODEL default moonshot/kimi-k2.6 +# E2E_RUN_ID Slug suffix; CI: cp455-${GITHUB_RUN_ID} +# E2E_PROVISION_TIMEOUT_SECS default 300 (5 min — fast teardown budget) +# E2E_KEEP_ORG 1 → skip teardown (debugging only) +# +# Exit codes: +# 0 happy path +# 1 generic failure +# 2 missing required env +# 3 provisioning timed out (assertion 1) +# 4 register timeout (assertion 2) +# 5 completion failure (assertion 3) +# 6 teardown left orphan (assertion 4) + +set -uo pipefail + +CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}" +ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}" +RUNTIME="${E2E_RUNTIME:-claude-code}" +BILLING_MODE="${E2E_BILLING_MODE:-platform_managed}" +PROVIDER="${E2E_PROVIDER:-platform}" +MODEL="${E2E_MODEL:-moonshot/kimi-k2.6}" +PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-300}" +KEEP_ORG="${E2E_KEEP_ORG:-}" +RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}" +SLUG="cp455-${RUNTIME}-${RUN_ID_SUFFIX}" +WORKSPACE_ID="" +TENANT_TOKEN="" +RESULT_JSON="/tmp/cell-result.json" +PROVISION_START_EPOCH="" +PROVISION_END_EPOCH="" +REGISTER_STATUS="not_attempted" +COMPLETION_STATUS="not_attempted" +TEARDOWN_STATUS="not_attempted" +EXIT_CODE=0 + +# Structured per-cell results writer. Emits JSON with all 4 +# assertion statuses + elapsed timing. Called from EXIT trap so +# results are captured even on early failure. +write_result() { + local elapsed="${1:-0}" + cat > "${RESULT_JSON}" </dev/null; then + elapsed=$(( now - PROVISION_START_EPOCH )) + fi + + # Assertion 4: teardown terminates workspace even on failure. 
+ if [ -z "${KEEP_ORG}" ] && [ -n "${SLUG:-}" ]; then + if [ -n "${WORKSPACE_ID:-}" ] || [ -n "${SLUG:-}" ]; then + echo "::group::Teardown (trap)" + echo "DELETE ${CP_URL}/cp/admin/tenants/${SLUG}" + local teardown_http_code + teardown_http_code=$(curl -sS -o /dev/null -w '%{http_code}' \ + -X DELETE \ + -H "Authorization: Bearer ${ADMIN_TOKEN}" \ + --max-time 60 \ + "${CP_URL}/cp/admin/tenants/${SLUG}" || echo "000") + if [ "${teardown_http_code}" = "200" ] || [ "${teardown_http_code}" = "204" ] || [ "${teardown_http_code}" = "404" ]; then + TEARDOWN_STATUS="ok" + echo "Teardown OK (HTTP ${teardown_http_code})" + else + TEARDOWN_STATUS="leak_risk_http_${teardown_http_code}" + echo "::error::Teardown returned HTTP ${teardown_http_code} — orphan risk" + # Bump exit code to 6 if teardown is the failure source. + if [ "${EXIT_CODE}" -eq 0 ]; then + EXIT_CODE=6 + fi + fi + echo "::endgroup::" + fi + else + TEARDOWN_STATUS="skipped_keep_org" + fi + + write_result "${elapsed}" + echo "Structured results written to ${RESULT_JSON}" + cat "${RESULT_JSON}" + exit "${EXIT_CODE}" +} +trap on_exit EXIT +trap 'echo "::error::Script aborted on signal"; exit 130' INT TERM + +PROVISION_START_EPOCH=$(date +%s) + +# Assertion 1: Provision request accepted; workspace transitions to +# booting/running. +echo "::group::Assertion 1: Provision" +echo "POST ${CP_URL}/cp/admin/orgs slug=${SLUG} runtime=${RUNTIME} billing_mode=${BILLING_MODE} provider=${PROVIDER} model=${MODEL}" +PROVISION_HTTP_CODE=$(curl -sS -o /tmp/provision-resp.json -w '%{http_code}' \ + -X POST \ + -H "Authorization: Bearer ${ADMIN_TOKEN}" \ + -H "Content-Type: application/json" \ + --max-time 30 \ + -d "$(cat </dev/null || true + EXIT_CODE=1 + exit "${EXIT_CODE}" +fi +echo "::endgroup::" + +# Wait for org to reach running + retrieve per-tenant token. Bounded +# at PROVISION_TIMEOUT_SECS. 
We poll the admin token endpoint; once
+# the org is up, the endpoint returns 200 with the token, and the
+# workspace_id is in the same response or in a follow-up /orgs/:slug
+# call.
+echo "::group::Wait for org to be ready (max ${PROVISION_TIMEOUT_SECS}s)"
+WAIT_START=$(date +%s)
+WAIT_DEADLINE=$(( WAIT_START + PROVISION_TIMEOUT_SECS ))
+TENANT_TOKEN=""
+while [ "$(date +%s)" -lt "${WAIT_DEADLINE}" ]; do
+  # curl failures are mapped to a non-200 code so the loop simply retries.
+  TOKEN_HTTP_CODE=$(curl -sS -o /tmp/token-resp.json -w '%{http_code}' \
+    -H "Authorization: Bearer ${ADMIN_TOKEN}" \
+    --max-time 10 \
+    "${CP_URL}/cp/admin/orgs/${SLUG}/admin-token" || echo "000")
+  if [ "${TOKEN_HTTP_CODE}" = "200" ]; then
+    TENANT_TOKEN=$(jq -r '.admin_token // .token // empty' /tmp/token-resp.json 2>/dev/null || echo "")
+    if [ -n "${TENANT_TOKEN}" ]; then
+      WORKSPACE_ID=$(jq -r '.workspace_id // .default_workspace_id // empty' /tmp/token-resp.json 2>/dev/null || echo "")
+      if [ -z "${WORKSPACE_ID}" ]; then
+        # Fallback: fetch the org record by slug. Bounded like the token
+        # poll so one hung request cannot overrun PROVISION_TIMEOUT_SECS.
+        WORKSPACE_ID=$(curl -sS --max-time 10 -H "Authorization: Bearer ${ADMIN_TOKEN}" \
+          "${CP_URL}/cp/admin/orgs/${SLUG}" | jq -r '.workspace_id // .default_workspace_id // empty' 2>/dev/null || echo "")
+      fi
+      if [ -n "${WORKSPACE_ID}" ]; then
+        PROVISION_END_EPOCH=$(date +%s)
+        echo "Org ready in $(( PROVISION_END_EPOCH - WAIT_START ))s — workspace_id=${WORKSPACE_ID}"
+        break
+      fi
+    fi
+  fi
+  sleep 5
+done
+# Both the tenant token and a workspace_id are required by the later
+# assertions; missing either one counts as a provisioning timeout (exit 3).
+if [ -z "${TENANT_TOKEN}" ] || [ -z "${WORKSPACE_ID}" ]; then
+  echo "::error::Provision timed out (org never reached running within ${PROVISION_TIMEOUT_SECS}s)"
+  EXIT_CODE=3
+  exit "${EXIT_CODE}"
+fi
+echo "::endgroup::"
+
+# Assertion 2: Controlplane receives /registry/register for that
+# workspace_id. The harness doesn't POST to /registry/register
+# directly — that's the workspace-server's own job on boot. We
+# verify the registration was received by polling the registry
+# endpoint (or by checking that a /workspaces/:id call returns
+# the expected fields).
+echo "::group::Assertion 2: /registry/register for workspace_id=${WORKSPACE_ID}"
+# Poll for up to 60s; each request is itself capped at 10s.
+REGISTER_DEADLINE=$(( $(date +%s) + 60 ))
+while [ "$(date +%s)" -lt "${REGISTER_DEADLINE}" ]; do
+  REG_HTTP_CODE=$(curl -sS -o /tmp/reg-resp.json -w '%{http_code}' \
+    -H "Authorization: Bearer ${TENANT_TOKEN}" \
+    --max-time 10 \
+    "${CP_URL}/cp/registry/workspaces/${WORKSPACE_ID}" || echo "000")
+  if [ "${REG_HTTP_CODE}" = "200" ]; then
+    # jq's `//` treats false like null, so an explicit `"registered": false`
+    # would otherwise fall through to .workspace_id and count as registered.
+    REGISTERED=$(jq -r 'if .registered == false then empty else (.registered // .workspace_id // empty) end' /tmp/reg-resp.json 2>/dev/null || echo "")
+    if [ -n "${REGISTERED}" ]; then
+      REGISTER_STATUS="ok"
+      echo "Registry confirms workspace_id=${WORKSPACE_ID} registered"
+      break
+    fi
+  fi
+  sleep 3
+done
+if [ "${REGISTER_STATUS}" != "ok" ]; then
+  echo "::error::Registry did not confirm registration within 60s"
+  cat /tmp/reg-resp.json 2>/dev/null || true
+  # Record the real outcome; the initial "not_attempted" would misreport
+  # this cell in the structured results.
+  REGISTER_STATUS="timeout"
+  EXIT_CODE=4
+  exit "${EXIT_CODE}"
+fi
+echo "::endgroup::"
+
+# Assertion 3: JSON-RPC/completion route returns successful minimal
+# response. One minimal completion call — keep payload small.
+echo "::group::Assertion 3: JSON-RPC completion" +COMPLETION_HTTP_CODE=$(curl -sS -o /tmp/completion-resp.json -w '%{http_code}' \ + -X POST \ + -H "Authorization: Bearer ${TENANT_TOKEN}" \ + -H "Content-Type: application/json" \ + --max-time 30 \ + -d "$(cat </dev/null || true + EXIT_CODE=5 + exit "${EXIT_CODE}" +fi +# Verify JSON-RPC 2.0 success envelope +RPC_ERROR=$(jq -r '.error // empty' /tmp/completion-resp.json 2>/dev/null || echo "") +if [ -n "${RPC_ERROR}" ]; then + echo "::error::Completion returned JSON-RPC error: ${RPC_ERROR}" + cat /tmp/completion-resp.json 2>/dev/null || true + EXIT_CODE=5 + exit "${EXIT_CODE}" +fi +RPC_RESULT=$(jq -r '.result // empty' /tmp/completion-resp.json 2>/dev/null || echo "") +if [ -z "${RPC_RESULT}" ] || [ "${RPC_RESULT}" = "null" ]; then + echo "::error::Completion response missing result field" + cat /tmp/completion-resp.json 2>/dev/null || true + EXIT_CODE=5 + exit "${EXIT_CODE}" +fi +COMPLETION_STATUS="ok" +echo "Completion OK" +echo "::endgroup::" + +echo "All 4 assertions passed for ${SLUG} (workspace_id=${WORKSPACE_ID})" \ No newline at end of file