feat(ci): cp#455 minimal-cell boot-to-registration e2e (Stage 1) #2299

Merged
devops-engineer merged 6 commits from cp455-minimal-cell-boot-e2e-stage1 into main 2026-06-06 21:27:25 +00:00
2 changed files with 464 additions and 0 deletions
@@ -0,0 +1,165 @@
name: boot-to-registration-e2e (advisory)
# cp#455 — Minimal-cell boot-to-registration e2e.
# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
# it should now go GREEN since the fix is live."
#
# Stage 1 of 5-stage rollout. Reuses the dispatch-only EC2
# provisioning path from test_staging_full_saas.sh but reduced to
# the minimum boot-to-registration surface:
#
# 1. Provision request accepted; workspace transitions to booting/running
# 2. Controlplane receives /registry/register for that workspace_id
# 3. JSON-RPC/completion route returns successful minimal response
# 4. Teardown terminates workspace even on failure (trap)
#
# Advisory (non-blocking) per Researcher Stage 2 design — RED on
# current main is expected pre-cp#469-cluster. After cp#477 deploy
# (888efceb) + PR #2167 merge, cell should turn GREEN. THAT green
# is the cluster-proof signal.
#
# Cost controls (mandatory):
# - SPOT instances (tagged run_id/workspace_id for cost attribution)
# - Fast teardown (~3-5 min wall-clock) even on assertion failure
# - Structured per-cell results JSON (runtime/provider/model/
# billing_mode/workspace_id/register_status/completion_status/
# teardown_status/elapsed_seconds)
#
# Cell parameters (fixed in the job env below; not dispatch inputs):
# runtime : default claude-code
# billing_mode : default platform_managed (the cp#469-cluster path)
# provider : default platform (vs direct-to-provider)
# model : default moonshot/kimi-k2.6 (CTO-specified)
#
# PR target: molecule-core (this file). Companion harness extension
# (test_minimal_boot_cell.sh) lives in tests/e2e/ alongside
# test_staging_full_saas.sh — same repo, same branch.
#
# Note: cp#455 was originally spec'd to live in molecule-controlplane
# (`.gitea/workflows/` path), but molecule-core's CI is the home for
# tenant-boot e2e tests in this stage. Stage 2 may move the path.
on:
  workflow_dispatch:
    # Gitea 1.22.6 has no support for workflow_dispatch.inputs
    # (feedback_gitea_workflow_dispatch_inputs_unsupported), so the cell
    # parameters are fixed in the job-level env. Matrix/param support can
    # be added in Stage 2 once the Gitea version allows it.
  # Advisory workflow: manual dispatch only, no cron schedule. Branch
  # protection does not require it — a RED run on main is expected before
  # the cp#469-cluster deploy, and GREEN signals that the cluster is live.

permissions:
  contents: read
  # Deliberately no issue-write: failures are visible as red runs in the
  # workflow history.

concurrency:
  group: boot-to-registration-e2e
  cancel-in-progress: false
jobs:
  # bp-exempt: advisory e2e — non-gating, manual dispatch only (cp#455 Stage 1)
  minimal-cell:
    name: Minimal cell (claude-code + platform + moonshot/kimi-k2.6)
    runs-on: ubuntu-latest
    # Bounded at 12 min. Wall-clock budget breakdown:
    #   - cold EC2 provision: ~3-4 min (SPOT)
    #   - /registry/register wait: ~30s
    #   - completion call: ~10s
    #   - teardown: ~30-60s
    #   - tail headroom: ~6-7 min
    # Worst case inside the harness (every wait runs to its limit) is
    # 300s provision wait + 60s register wait + 30s completion + 60s
    # teardown, i.e. roughly 7.5 min — still inside this bound.
    timeout-minutes: 12
    env:
      # Hardcoded defaults — Gitea 1.22.6 does not support
      # workflow_dispatch.inputs
      # (feedback_gitea_workflow_dispatch_inputs_unsupported). Stage 2 can
      # add matrix/param support once the Gitea version supports it.
      E2E_RUNTIME: claude-code
      E2E_BILLING_MODE: platform_managed
      E2E_PROVIDER: platform
      E2E_MODEL: moonshot/kimi-k2.6
      E2E_RUN_ID: cp455-${{ github.run_id }}
      E2E_PROVISION_TIMEOUT_SECS: '300'  # 5 min — fast teardown budget
      MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Verify required secrets present
        run: |
          if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — minimal-cell e2e cannot run"
            echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
            exit 1
          fi

      # This step only checks for the tools; nothing is installed. A
      # missing tool means the runner image changed underneath us.
      - name: Verify required tools present
        run: |
          for cmd in jq curl python3; do
            command -v "$cmd" >/dev/null 2>&1 || {
              echo "::error::required tool '$cmd' not on PATH — runner image regression?"
              exit 1
            }
          done

      - name: Run minimal-cell boot-to-registration harness
        # The harness script handles its own teardown via EXIT trap;
        # even on assertion failure (provision timeout, register
        # timeout, completion failure), the workspace is deprovisioned
        # and a leak is reported. Exit code propagates from the script.
        # The harness writes its structured per-cell results to
        # /tmp/cell-result.json; the next step renders that file into
        # ${GITHUB_STEP_SUMMARY} so operators see pass/fail per
        # assertion without scrolling.
        run: |
          bash tests/e2e/test_minimal_boot_cell.sh

      - name: Emit structured per-cell results
        if: always()
        # Always run (even on failure) so the structured results are
        # visible in the workflow summary. The script writes a JSON
        # file at /tmp/cell-result.json; this step renders it as a
        # job summary.
        run: |
          if [ -f /tmp/cell-result.json ]; then
            echo "## Minimal-cell results" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo '```json' >> "$GITHUB_STEP_SUMMARY"
            cat /tmp/cell-result.json >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo '```' >> "$GITHUB_STEP_SUMMARY"
          else
            echo "## Minimal-cell results: NO_RESULT_FILE" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo "Harness did not produce /tmp/cell-result.json — likely crashed before trap fired." >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Failure summary
        if: failure()
        # The slug printed here mirrors the harness derivation
        # (cp455-<runtime>-<E2E_RUN_ID>) so it matches the tenant that
        # was actually provisioned and torn down.
        run: |
          {
            echo "## cp#455 minimal-cell FAILED"
            echo ""
            echo "**Run ID:** ${{ github.run_id }}"
            echo "**Runtime:** ${E2E_RUNTIME}"
            echo "**Billing mode:** ${E2E_BILLING_MODE}"
            echo "**Provider:** ${E2E_PROVIDER}"
            echo "**Model:** ${E2E_MODEL}"
            echo "**Slug:** cp455-${E2E_RUNTIME}-${E2E_RUN_ID}"
            echo ""
            echo "### What this means"
            echo ""
            echo "The minimal claude-code+kimi cell did not pass all 4 assertions:"
            echo "1. Provision request accepted; workspace transitions to booting/running"
            echo "2. Controlplane receives /registry/register for that workspace_id"
            echo "3. JSON-RPC/completion route returns successful minimal response"
            echo "4. Teardown terminates workspace even on failure (trap)"
            echo ""
            echo "RED is expected pre-cp#469-cluster. After cp#477 deploy (888efceb) + PR #2167 merge,"
            echo "this should turn GREEN. Persistent RED after both merge = cluster bug, not e2e bug."
            echo ""
            echo "### Next steps"
            echo ""
            echo "1. Check the harness output above for the assertion that failed"
            echo "2. If assertion 1 fails: provision path broken — check CP admin API + EC2 quota"
            echo "3. If assertion 2 fails: /registry/register path broken — check workspace-server boot"
            echo "4. If assertion 3 fails: LLM proxy / completion path broken — check cp#469 cluster"
            echo "5. If assertion 4 fails: teardown trap broken — leak risk, fix immediately"
          } >> "$GITHUB_STEP_SUMMARY"
+299
View File
@@ -0,0 +1,299 @@
#!/usr/bin/env bash
# cp#455 — Minimal-cell boot-to-registration harness.
# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
# it should now go GREEN since the fix is live."
#
# Stage 1 of 5-stage rollout. Reduced to the minimum boot-to-
# registration surface so each cell run is ~3-5 min wall-clock.
#
# Four assertions (per Researcher Task #79 spec):
# 1. Provision request accepted; workspace transitions to booting/running
# 2. Controlplane receives /registry/register for that workspace_id
# 3. JSON-RPC/completion route returns successful minimal response
# 4. Teardown terminates workspace even on failure (trap)
#
# Cost controls (mandatory):
# - SPOT instances (via the dispatch-only EC2 provisioning path;
# we don't set instance type — that's the provisioner's call)
# - Fast teardown ~3-5 min wall-clock
# - Structured per-cell results JSON output
#
# Auth model (mirrors test_staging_full_saas.sh):
# Single MOLECULE_ADMIN_TOKEN drives everything.
# - POST /cp/admin/orgs to provision
# - GET /cp/admin/orgs/:slug/admin-token for per-tenant token
# - DELETE /cp/admin/tenants/:slug for teardown
# Per-tenant admin token drives tenant API calls (workspaces,
# /registry/register, JSON-RPC completion).
#
# Required env:
# MOLECULE_CP_URL default: https://staging-api.moleculesai.app
# MOLECULE_ADMIN_TOKEN CP admin bearer
#
# Optional env (set by the workflow's job-level env; not dispatch inputs):
# E2E_RUNTIME default claude-code
# E2E_BILLING_MODE default platform_managed
# E2E_PROVIDER default platform
# E2E_MODEL default moonshot/kimi-k2.6
# E2E_RUN_ID Slug suffix; CI: cp455-${GITHUB_RUN_ID}
# E2E_PROVISION_TIMEOUT_SECS default 300 (5 min — fast teardown budget)
# E2E_KEEP_ORG 1 → skip teardown (debugging only)
#
# Exit codes:
# 0 happy path
# 1 generic failure
# 2 missing required env
# 3 provisioning timed out (assertion 1)
# 4 register timeout (assertion 2)
# 5 completion failure (assertion 3)
# 6 teardown left orphan (assertion 4)
# -u: unset variables are errors; pipefail: a pipeline fails if any stage
# fails. No -e on purpose — every failure path below sets EXIT_CODE and
# exits explicitly so the EXIT trap can report it.
set -uo pipefail

# --- Configuration (environment-overridable) -------------------------------
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"

# The header's exit-code table promises 2 for missing required env. An
# explicit check is used because the exit status of a failed ${VAR:?}
# expansion is shell-defined and is not 2 in bash.
if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
  echo "MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN" >&2
  exit 2
fi
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN}"

RUNTIME="${E2E_RUNTIME:-claude-code}"
BILLING_MODE="${E2E_BILLING_MODE:-platform_managed}"
PROVIDER="${E2E_PROVIDER:-platform}"
MODEL="${E2E_MODEL:-moonshot/kimi-k2.6}"

# Used in shell arithmetic for the provisioning deadline, so reject
# anything that is not a plain non-negative integer up front.
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-300}"
case "${PROVISION_TIMEOUT_SECS}" in
  ''|*[!0-9]*)
    echo "E2E_PROVISION_TIMEOUT_SECS must be a non-negative integer (got '${PROVISION_TIMEOUT_SECS}')" >&2
    exit 2
    ;;
esac

KEEP_ORG="${E2E_KEEP_ORG:-}"

# Outside CI, fall back to a time+PID suffix so local runs get distinct slugs.
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
SLUG="cp455-${RUNTIME}-${RUN_ID_SUFFIX}"

# --- Mutable run state (read by the EXIT trap and the result writer) -------
WORKSPACE_ID=""
TENANT_TOKEN=""
RESULT_JSON="/tmp/cell-result.json"
PROVISION_START_EPOCH=""
PROVISION_END_EPOCH=""
REGISTER_STATUS="not_attempted"
COMPLETION_STATUS="not_attempted"
TEARDOWN_STATUS="not_attempted"
EXIT_CODE=0
# Structured per-cell results writer.
#
# Writes one JSON object to ${RESULT_JSON} containing the cell parameters,
# the per-assertion status variables, the elapsed time and the exit code.
# Called from the EXIT trap so results are captured even on early failure.
#
# Arguments:
#   $1  elapsed seconds (optional, default 0)
# Reads globals:
#   RUNTIME BILLING_MODE PROVIDER MODEL WORKSPACE_ID REGISTER_STATUS
#   COMPLETION_STATUS TEARDOWN_STATUS EXIT_CODE RESULT_JSON
#
# The object is built with jq rather than by pasting variables into a
# template, so a value containing a quote, backslash or newline still
# produces valid JSON.
write_result() {
  local elapsed="${1:-0}"
  local exit_code="${EXIT_CODE:-0}"

  # --argjson requires valid JSON numbers; fall back to safe values rather
  # than emitting nothing. 10# strips any leading zeros.
  if [[ "${elapsed}" =~ ^[0-9]+$ ]]; then
    elapsed=$(( 10#${elapsed} ))
  else
    elapsed=0
  fi
  if [[ "${exit_code}" =~ ^[0-9]+$ ]]; then
    exit_code=$(( 10#${exit_code} ))
  else
    exit_code=1
  fi

  jq -n \
    --arg runtime "${RUNTIME}" \
    --arg billing_mode "${BILLING_MODE}" \
    --arg provider "${PROVIDER}" \
    --arg model "${MODEL}" \
    --arg workspace_id "${WORKSPACE_ID}" \
    --arg register_status "${REGISTER_STATUS}" \
    --arg completion_status "${COMPLETION_STATUS}" \
    --arg teardown_status "${TEARDOWN_STATUS}" \
    --argjson elapsed_seconds "${elapsed}" \
    --argjson exit_code "${exit_code}" \
    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    '{
      runtime: $runtime,
      billing_mode: $billing_mode,
      provider: $provider,
      model: $model,
      workspace_id: $workspace_id,
      register_status: $register_status,
      completion_status: $completion_status,
      teardown_status: $teardown_status,
      elapsed_seconds: $elapsed_seconds,
      exit_code: $exit_code,
      ts: $ts
    }' > "${RESULT_JSON}"
}
# EXIT trap — ALWAYS runs.
#
# Captures the script's exit status, tears the tenant down (assertion 4)
# unless E2E_KEEP_ORG is set, writes the structured results file, prints
# it, and exits with the final code. A teardown failure on an otherwise
# successful run turns the exit code into 6.
on_exit() {
  local exit_code=$?
  EXIT_CODE=${exit_code}

  # Elapsed time is measured from the start of provisioning; it stays 0
  # if the script died before that timestamp was recorded.
  local now elapsed=0
  now=$(date +%s)
  if [ -n "${PROVISION_START_EPOCH:-}" ] && [ "${PROVISION_START_EPOCH}" -gt 0 ] 2>/dev/null; then
    elapsed=$(( now - PROVISION_START_EPOCH ))
  fi

  # Assertion 4: teardown terminates workspace even on failure.
  if [ -n "${KEEP_ORG}" ]; then
    TEARDOWN_STATUS="skipped_keep_org"
  elif [ -n "${SLUG:-}" ]; then
    echo "::group::Teardown (trap)"
    echo "DELETE ${CP_URL}/cp/admin/tenants/${SLUG}"
    local teardown_http_code
    # curl prints "000" through -w even when the transfer itself fails, so
    # only swallow its exit status here; appending another "000" would
    # turn the reported code into "000000".
    teardown_http_code=$(curl -sS -o /dev/null -w '%{http_code}' \
      -X DELETE \
      -H "Authorization: Bearer ${ADMIN_TOKEN}" \
      --max-time 60 \
      "${CP_URL}/cp/admin/tenants/${SLUG}" || true)
    teardown_http_code="${teardown_http_code:-000}"

    case "${teardown_http_code}" in
      # 200/204: deleted. 202: deletion accepted and still in progress.
      # 404: tenant already gone (or never created) — nothing to leak.
      200|202|204|404)
        TEARDOWN_STATUS="ok"
        echo "Teardown OK (HTTP ${teardown_http_code})"
        ;;
      *)
        TEARDOWN_STATUS="leak_risk_http_${teardown_http_code}"
        echo "::error::Teardown returned HTTP ${teardown_http_code} — orphan risk"
        # Bump exit code to 6 only if teardown is the sole failure; an
        # earlier assertion's exit code is more informative and is kept.
        if [ "${EXIT_CODE}" -eq 0 ]; then
          EXIT_CODE=6
        fi
        ;;
    esac
    echo "::endgroup::"
  fi

  write_result "${elapsed}"
  echo "Structured results written to ${RESULT_JSON}"
  cat "${RESULT_JSON}"
  exit "${EXIT_CODE}"
}
# From here on every exit path goes through on_exit (results + teardown).
# INT/TERM are converted into a normal exit so the EXIT trap still fires.
trap on_exit EXIT
trap 'echo "::error::Script aborted on signal"; exit 130' INT TERM
PROVISION_START_EPOCH=$(date +%s)

# Assertion 1: Provision request accepted; workspace transitions to
# booting/running.
echo "::group::Assertion 1: Provision"
echo "POST ${CP_URL}/cp/admin/orgs slug=${SLUG} runtime=${RUNTIME} billing_mode=${BILLING_MODE} provider=${PROVIDER} model=${MODEL}"

# Build the request body with jq so every value is JSON-escaped.
PROVISION_BODY=$(jq -cn \
  --arg slug "${SLUG}" \
  --arg runtime "${RUNTIME}" \
  --arg billing_mode "${BILLING_MODE}" \
  --arg provider "${PROVIDER}" \
  --arg model "${MODEL}" \
  --arg run_id "${RUN_ID_SUFFIX}" \
  '{
    slug: $slug,
    runtime: $runtime,
    billing_mode: $billing_mode,
    provider: $provider,
    model: $model,
    tier: "spot",
    tags: {cp455_minimal_cell: "1", run_id: $run_id}
  }')

# curl already prints "000" via -w when the transfer fails; only its exit
# status is swallowed so the code is not reported as "000000".
PROVISION_HTTP_CODE=$(curl -sS -o /tmp/provision-resp.json -w '%{http_code}' \
  -X POST \
  -H "Authorization: Bearer ${ADMIN_TOKEN}" \
  -H "Content-Type: application/json" \
  --max-time 30 \
  -d "${PROVISION_BODY}" \
  "${CP_URL}/cp/admin/orgs" || true)
PROVISION_HTTP_CODE="${PROVISION_HTTP_CODE:-000}"
echo "HTTP ${PROVISION_HTTP_CODE}"

if [ "${PROVISION_HTTP_CODE}" != "202" ] && [ "${PROVISION_HTTP_CODE}" != "200" ]; then
  echo "::error::Provision failed (HTTP ${PROVISION_HTTP_CODE})"
  cat /tmp/provision-resp.json 2>/dev/null || true
  # Close the log group before the trap opens its own teardown group.
  echo "::endgroup::"
  EXIT_CODE=1
  exit "${EXIT_CODE}"
fi
echo "::endgroup::"
# Wait for org to reach running + retrieve per-tenant token. Bounded
# at PROVISION_TIMEOUT_SECS. We poll the admin token endpoint; once
# the org is up, the endpoint returns 200 with the token, and the
# workspace_id is in the same response or in a follow-up /orgs/:slug
# call. The loop ends only when BOTH the token and the workspace_id are
# known; otherwise it keeps polling every 5s until the deadline.
echo "::group::Wait for org to be ready (max ${PROVISION_TIMEOUT_SECS}s)"
WAIT_START=$(date +%s)
WAIT_DEADLINE=$(( WAIT_START + PROVISION_TIMEOUT_SECS ))
TENANT_TOKEN=""
while [ "$(date +%s)" -lt "${WAIT_DEADLINE}" ]; do
  # curl prints "000" via -w on transport failure; swallow only its exit
  # status so the code is never doubled.
  TOKEN_HTTP_CODE=$(curl -sS -o /tmp/token-resp.json -w '%{http_code}' \
    -H "Authorization: Bearer ${ADMIN_TOKEN}" \
    --max-time 10 \
    "${CP_URL}/cp/admin/orgs/${SLUG}/admin-token" || true)
  TOKEN_HTTP_CODE="${TOKEN_HTTP_CODE:-000}"

  if [ "${TOKEN_HTTP_CODE}" = "200" ]; then
    TENANT_TOKEN=$(jq -r '.admin_token // .token // empty' /tmp/token-resp.json 2>/dev/null || echo "")
    if [ -n "${TENANT_TOKEN}" ]; then
      WORKSPACE_ID=$(jq -r '.workspace_id // .default_workspace_id // empty' /tmp/token-resp.json 2>/dev/null || echo "")
      if [ -z "${WORKSPACE_ID}" ]; then
        # Fallback: fetch the org record by slug. Time-boxed like the
        # other polls so a stalled connection cannot outlive the deadline.
        WORKSPACE_ID=$(curl -sS --max-time 10 \
          -H "Authorization: Bearer ${ADMIN_TOKEN}" \
          "${CP_URL}/cp/admin/orgs/${SLUG}" \
          | jq -r '.workspace_id // .default_workspace_id // empty' 2>/dev/null || echo "")
      fi
      if [ -n "${WORKSPACE_ID}" ]; then
        PROVISION_END_EPOCH=$(date +%s)
        echo "Org ready in $(( PROVISION_END_EPOCH - WAIT_START ))s — workspace_id=${WORKSPACE_ID}"
        break
      fi
    fi
  fi
  sleep 5
done

if [ -z "${TENANT_TOKEN}" ] || [ -z "${WORKSPACE_ID}" ]; then
  echo "::error::Provision timed out (org never reached running within ${PROVISION_TIMEOUT_SECS}s)"
  # Close the log group before the trap opens its own teardown group.
  echo "::endgroup::"
  EXIT_CODE=3
  exit "${EXIT_CODE}"
fi
echo "::endgroup::"
# Assertion 2: Controlplane receives /registry/register for that
# workspace_id. The harness doesn't POST to /registry/register
# directly — that's the workspace-server's own job on boot. We
# verify the registration was received by polling the registry
# endpoint for up to 60s (every 3s).
echo "::group::Assertion 2: /registry/register for workspace_id=${WORKSPACE_ID}"
REGISTER_DEADLINE=$(( $(date +%s) + 60 ))
while [ "$(date +%s)" -lt "${REGISTER_DEADLINE}" ]; do
  # curl prints "000" via -w on transport failure; swallow only its exit
  # status so the code is never doubled.
  REG_HTTP_CODE=$(curl -sS -o /tmp/reg-resp.json -w '%{http_code}' \
    -H "Authorization: Bearer ${TENANT_TOKEN}" \
    --max-time 10 \
    "${CP_URL}/cp/registry/workspaces/${WORKSPACE_ID}" || true)
  REG_HTTP_CODE="${REG_HTTP_CODE:-000}"

  if [ "${REG_HTTP_CODE}" = "200" ]; then
    # If the response carries an explicit "registered" field it is
    # authoritative: false/null means NOT registered yet. (A plain
    # `.registered // .workspace_id` would fall through to workspace_id
    # on `registered: false` and report a false positive.) Only when the
    # field is absent does the presence of workspace_id count.
    REGISTERED=$(jq -r '
      if type == "object" and has("registered")
      then (.registered | select(. != false and . != null))
      else (.workspace_id // empty)
      end' /tmp/reg-resp.json 2>/dev/null || echo "")
    if [ -n "${REGISTERED}" ]; then
      REGISTER_STATUS="ok"
      echo "Registry confirms workspace_id=${WORKSPACE_ID} registered"
      break
    fi
  fi
  sleep 3
done

if [ "${REGISTER_STATUS}" != "ok" ]; then
  # Record that the check ran and timed out, so the results file does not
  # claim the assertion was never attempted.
  REGISTER_STATUS="timeout"
  echo "::error::Registry did not confirm registration within 60s"
  cat /tmp/reg-resp.json 2>/dev/null || true
  echo "::endgroup::"
  EXIT_CODE=4
  exit "${EXIT_CODE}"
fi
echo "::endgroup::"
# Assertion 3: JSON-RPC/completion route returns successful minimal
# response. One minimal completion call — keep payload small.

# Shared failure path for assertion 3: records why it failed in
# COMPLETION_STATUS (so the results file does not say "not_attempted"),
# dumps the response body, closes the log group and exits with code 5.
#   $1  status string stored in COMPLETION_STATUS
#   $2  human-readable error message
completion_failed() {
  COMPLETION_STATUS="$1"
  echo "::error::$2"
  cat /tmp/completion-resp.json 2>/dev/null || true
  echo "::endgroup::"
  EXIT_CODE=5
  exit "${EXIT_CODE}"
}

echo "::group::Assertion 3: JSON-RPC completion"

# Build the request body with jq so every value is JSON-escaped.
COMPLETION_BODY=$(jq -cn \
  --arg workspace_id "${WORKSPACE_ID}" \
  --arg model "${MODEL}" \
  '{
    jsonrpc: "2.0",
    id: 1,
    method: "completion",
    params: {
      workspace_id: $workspace_id,
      model: $model,
      messages: [{role: "user", content: "ping"}],
      max_tokens: 1
    }
  }')

# curl prints "000" via -w on transport failure; swallow only its exit
# status so the code is never doubled.
COMPLETION_HTTP_CODE=$(curl -sS -o /tmp/completion-resp.json -w '%{http_code}' \
  -X POST \
  -H "Authorization: Bearer ${TENANT_TOKEN}" \
  -H "Content-Type: application/json" \
  --max-time 30 \
  -d "${COMPLETION_BODY}" \
  "${CP_URL}/cp/rpc" || true)
COMPLETION_HTTP_CODE="${COMPLETION_HTTP_CODE:-000}"
echo "HTTP ${COMPLETION_HTTP_CODE}"

if [ "${COMPLETION_HTTP_CODE}" != "200" ]; then
  completion_failed "http_${COMPLETION_HTTP_CODE}" "Completion failed (HTTP ${COMPLETION_HTTP_CODE})"
fi

# Verify JSON-RPC 2.0 success envelope: no "error" member, and a
# non-null "result" member.
RPC_ERROR=$(jq -r '.error // empty' /tmp/completion-resp.json 2>/dev/null || echo "")
if [ -n "${RPC_ERROR}" ]; then
  completion_failed "rpc_error" "Completion returned JSON-RPC error: ${RPC_ERROR}"
fi

RPC_RESULT=$(jq -r '.result // empty' /tmp/completion-resp.json 2>/dev/null || echo "")
if [ -z "${RPC_RESULT}" ] || [ "${RPC_RESULT}" = "null" ]; then
  completion_failed "missing_result" "Completion response missing result field"
fi

COMPLETION_STATUS="ok"
echo "Completion OK"
echo "::endgroup::"

# Assertion 4 (teardown) has not run yet at this point — it executes in
# the EXIT trap and can still turn the run red with exit code 6.
echo "Assertions 1-3 passed for ${SLUG} (workspace_id=${WORKSPACE_ID}); assertion 4 (teardown) runs in the EXIT trap"