2026-06-06 21:27:25 +00:00
2 changed files with 464 additions and 0 deletions
@@ -0,0 +1,165 @@
+name: boot-to-registration-e2e (advisory)
+
+# cp#455 — Minimal-cell boot-to-registration e2e.
+# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
+# it should now go GREEN since the fix is live."
+#
+# Stage 1 of 5-stage rollout. Reuses the dispatch-only EC2
+# provisioning path from test_staging_full_saas.sh but reduced to
+# the minimum boot-to-registration surface:
+#
+#   1. Provision request accepted; workspace transitions to booting/running
+#   2. Controlplane receives /registry/register for that workspace_id
+#   3. JSON-RPC/completion route returns successful minimal response
+#   4. Teardown terminates workspace even on failure (trap)
+#
+# Advisory (non-blocking) per Researcher Stage 2 design — RED on
+# current main is expected pre-cp#469-cluster. After cp#477 deploy
+# (888efceb) + PR #2167 merge, cell should turn GREEN. THAT green
+# is the cluster-proof signal.
+#
+# Cost controls (mandatory):
+#   - SPOT instances (tagged run_id/workspace_id for cost attribution)
+#   - Fast teardown (~3-5 min wall-clock) even on assertion failure
+#   - Structured per-cell results JSON (runtime/provider/model/
+#     billing_mode/workspace_id/register_status/completion_status/
+#     teardown_status/elapsed_seconds)
+#
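+# Cell parameters (hardcoded in the job env below; these are NOT
+# dispatch inputs, because Gitea 1.22.6 does not support
+# workflow_dispatch.inputs):
+#   runtime        : claude-code
+#   billing_mode   : platform_managed (the cp#469-cluster path)
+#   provider       : platform (vs direct-to-provider)
+#   model          : moonshot/kimi-k2.6 (CTO-specified)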
+#
+# PR target: molecule-core (this file). Companion harness extension
+# (test_minimal_boot_cell.sh) lives in tests/e2e/ alongside
+# test_staging_full_saas.sh — same repo, same branch.
+#
+# Note: cp#455 was originally spec'd to live in molecule-controlplane
+# (`.gitea/workflows/` path), but molecule-core's CI is the home for
+# tenant-boot e2e tests in this stage. Stage 2 may move the path.
+
+# Trigger: manual dispatch only (no push, pull_request or schedule).
+on:
+  workflow_dispatch:
+  # Note: Gitea 1.22.6 does not support workflow_dispatch.inputs
+  # (feedback_gitea_workflow_dispatch_inputs_unsupported). Defaults
+  # are hardcoded in the job env below. Stage 2 can add matrix/
+  # param support once the Gitea version supports it.
+
+# Advisory: no cron schedule, manual dispatch only. Branch protection
+# doesn't require this — RED on main is expected pre-cp#469-cluster
+# deploy, GREEN signals the cluster is live.
+# The workflow token only needs to read the repository for checkout.
+permissions:
+  contents: read
+  # No issue-write; failures surface as red runs in workflow history.
+
+# All runs share one concurrency group, so at most one runs at a time.
+# With cancel-in-progress: false a newly dispatched run waits for the
+# in-flight run instead of cancelling it.
+# NOTE(review): presumably this keeps an in-flight run's teardown from
+# being interrupted — confirm intent.
+concurrency:
+  group: boot-to-registration-e2e
+  cancel-in-progress: false
+
+jobs:
+  # bp-exempt: advisory e2e — non-gating, manual dispatch only (cp#455 Stage 1)
+  minimal-cell:
+    name: Minimal cell (claude-code + platform + moonshot/kimi-k2.6)
+    runs-on: ubuntu-latest
+    # Bounded at 12 min. Wall-clock budget breakdown:
+    #   - cold EC2 provision: ~3-4 min (SPOT)
+    #   - /registry/register wait: ~30s
+    #   - completion call: ~10s
+    #   - teardown: ~30-60s
+    #   - tail headroom: ~6-7 min
+    # Harness-side caps in tests/e2e/test_minimal_boot_cell.sh: provision
+    # wait E2E_PROVISION_TIMEOUT_SECS (300s), register wait 60s, completion
+    # call 30s, teardown call 60s.
+    timeout-minutes: 12
+    env:
+      # Hardcoded defaults — Gitea 1.22.6 does not support workflow_dispatch.inputs
+      # (feedback_gitea_workflow_dispatch_inputs_unsupported). Stage 2 can add
+      # matrix/param support once the Gitea version supports it.
+      E2E_RUNTIME: claude-code
+      E2E_BILLING_MODE: platform_managed
+      E2E_PROVIDER: platform
+      E2E_MODEL: moonshot/kimi-k2.6
+      # The harness uses this as the slug suffix: the tenant slug becomes
+      # cp455-<E2E_RUNTIME>-<E2E_RUN_ID>.
+      E2E_RUN_ID: cp455-${{ github.run_id }}
+      E2E_PROVISION_TIMEOUT_SECS: '300' # 5 min — fast teardown budget
+      # Falls back to the staging URL when the STAGING_CP_URL variable is unset.
+      MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
+      # CP admin bearer; the first step fails the job early when it is empty.
+      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+    steps:
+      # Checkout is pinned to a full commit SHA; the trailing comment records the tag.
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Verify required secrets present
+        # Guard step: stop the job with an actionable error when the admin
+        # token secret has not been configured, before anything is provisioned.
+        run: |
+          if [ -n "${MOLECULE_ADMIN_TOKEN:-}" ]; then
+            exit 0
+          fi
+          echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — minimal-cell e2e cannot run"
+          echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
+          exit 1
+
+      - name: Verify required tools present
+        # Nothing is installed here: the job relies on the runner image
+        # already shipping these tools. This step only checks for them and
+        # fails fast, reporting every missing tool in one pass rather than
+        # stopping at the first.
+        run: |
+          missing=""
+          for cmd in jq curl python3; do
+            command -v "$cmd" >/dev/null 2>&1 || missing="${missing} ${cmd}"
+          done
+          if [ -n "${missing}" ]; then
+            # Word-splitting of ${missing} is intentional (space-separated list).
+            for cmd in ${missing}; do
+              echo "::error::required tool '$cmd' not on PATH — runner image regression?"
+            done
+            exit 1
+          fi
+
+      - name: Run minimal-cell boot-to-registration harness
+        # The harness script handles its own teardown via EXIT trap;
+        # even on assertion failure (provision timeout, register
+        # timeout, completion failure), the tenant is deprovisioned
+        # and a failed teardown is reported as a leak risk. Exit code
+        # propagates from the script.
+        # The harness writes its structured per-cell results to
+        # /tmp/cell-result.json; the next step renders that file into
+        # ${GITHUB_STEP_SUMMARY} so operators see pass/fail per assertion
+        # without scrolling.
+        run: |
+          bash tests/e2e/test_minimal_boot_cell.sh
+
+      - name: Emit structured per-cell results
+        # Runs regardless of the harness outcome so the results are always
+        # visible in the job summary. The harness leaves its JSON at
+        # /tmp/cell-result.json; this step wraps it in a fenced block, or
+        # records that the file is absent.
+        if: always()
+        run: |
+          result_file=/tmp/cell-result.json
+          {
+            if [ -f "${result_file}" ]; then
+              echo "## Minimal-cell results"
+              echo ""
+              echo '```json'
+              cat "${result_file}"
+              echo ""
+              echo '```'
+            else
+              echo "## Minimal-cell results: NO_RESULT_FILE"
+              echo ""
+              echo "Harness did not produce /tmp/cell-result.json — likely crashed before trap fired."
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Failure summary
+        # Only on failure: append a triage guide to the job summary.
+        # The slug printed here mirrors how the harness builds it
+        # (cp455-<runtime>-<E2E_RUN_ID>), so it matches the tenant that was
+        # actually provisioned and can be used to look it up.
+        # The key/value lines are a bullet list so each renders on its own
+        # line instead of collapsing into a single paragraph.
+        if: failure()
+        run: |
+          {
+            echo "## cp#455 minimal-cell FAILED"
+            echo ""
+            echo "- **Run ID:** ${{ github.run_id }}"
+            echo "- **Runtime:** ${E2E_RUNTIME}"
+            echo "- **Billing mode:** ${E2E_BILLING_MODE}"
+            echo "- **Provider:** ${E2E_PROVIDER}"
+            echo "- **Model:** ${E2E_MODEL}"
+            echo "- **Slug:** cp455-${E2E_RUNTIME}-${E2E_RUN_ID}"
+            echo ""
+            echo "### What this means"
+            echo ""
+            echo "The minimal claude-code+kimi cell did not pass all 4 assertions:"
+            echo "1. Provision request accepted; workspace transitions to booting/running"
+            echo "2. Controlplane receives /registry/register for that workspace_id"
+            echo "3. JSON-RPC/completion route returns successful minimal response"
+            echo "4. Teardown terminates workspace even on failure (trap)"
+            echo ""
+            echo "RED is expected pre-cp#469-cluster. After cp#477 deploy (888efceb) + PR #2167 merge,"
+            echo "this should turn GREEN. Persistent RED after both merge = cluster bug, not e2e bug."
+            echo ""
+            echo "### Next steps"
+            echo ""
+            echo "1. Check the harness output above for the assertion that failed"
+            echo "2. If assertion 1 fails: provision path broken — check CP admin API + EC2 quota"
+            echo "3. If assertion 2 fails: /registry/register path broken — check workspace-server boot"
+            echo "4. If assertion 3 fails: LLM proxy / completion path broken — check cp#469 cluster"
+            echo "5. If assertion 4 fails: teardown trap broken — leak risk, fix immediately"
+          } >> "$GITHUB_STEP_SUMMARY"
@@ -0,0 +1,299 @@
+#!/usr/bin/env bash
+# cp#455 — Minimal-cell boot-to-registration harness.
+# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
+# it should now go GREEN since the fix is live."
+#
+# Stage 1 of 5-stage rollout. Reduced to the minimum boot-to-
+# registration surface so each cell run is ~3-5 min wall-clock.
+#
+# Four assertions (per Researcher Task #79 spec):
+#   1. Provision request accepted; workspace transitions to booting/running
+#   2. Controlplane receives /registry/register for that workspace_id
+#   3. JSON-RPC/completion route returns successful minimal response
+#   4. Teardown terminates workspace even on failure (trap)
+#
+# Cost controls (mandatory):
+#   - SPOT instances (via the dispatch-only EC2 provisioning path;
+#     we don't set instance type — that's the provisioner's call)
+#   - Fast teardown ~3-5 min wall-clock
+#   - Structured per-cell results JSON output
+#
+# Auth model (mirrors test_staging_full_saas.sh):
+#   Single MOLECULE_ADMIN_TOKEN drives everything.
+#     - POST /cp/admin/orgs to provision
+#     - GET  /cp/admin/orgs/:slug/admin-token for per-tenant token
+#     - DELETE /cp/admin/tenants/:slug for teardown
+#   Per-tenant admin token drives tenant API calls (workspaces,
+#   /registry/register, JSON-RPC completion).
+#
+# Required env:
+#   MOLECULE_CP_URL        default: https://staging-api.moleculesai.app
+#   MOLECULE_ADMIN_TOKEN   CP admin bearer
+#
+# Optional env (set by the workflow's job env in CI; defaults apply when unset):
+#   E2E_RUNTIME            default claude-code
+#   E2E_BILLING_MODE       default platform_managed
+#   E2E_PROVIDER           default platform
+#   E2E_MODEL              default moonshot/kimi-k2.6
+#   E2E_RUN_ID             Slug suffix; CI: cp455-${GITHUB_RUN_ID}
+#   E2E_PROVISION_TIMEOUT_SECS  default 300 (5 min — fast teardown budget)
+#   E2E_KEEP_ORG           1 → skip teardown (debugging only)
+#
+# Exit codes:
+#   0  happy path
+#   1  generic failure
+#   2  missing required env
+#   3  provisioning timed out (assertion 1)
+#   4  register timeout (assertion 2)
+#   5  completion failure (assertion 3)
+#   6  teardown left orphan (assertion 4)
+
+set -uo pipefail
+
+CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
+ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
+RUNTIME="${E2E_RUNTIME:-claude-code}"
+BILLING_MODE="${E2E_BILLING_MODE:-platform_managed}"
+PROVIDER="${E2E_PROVIDER:-platform}"
+MODEL="${E2E_MODEL:-moonshot/kimi-k2.6}"
+PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-300}"
+KEEP_ORG="${E2E_KEEP_ORG:-}"
+RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
+SLUG="cp455-${RUNTIME}-${RUN_ID_SUFFIX}"
+WORKSPACE_ID=""
+TENANT_TOKEN=""
+RESULT_JSON="/tmp/cell-result.json"
+PROVISION_START_EPOCH=""
+PROVISION_END_EPOCH=""
+REGISTER_STATUS="not_attempted"
+COMPLETION_STATUS="not_attempted"
+TEARDOWN_STATUS="not_attempted"
+EXIT_CODE=0
+
+# Structured per-cell results writer. Emits JSON with all 4
+# assertion statuses + elapsed timing. Called from EXIT trap so
+# results are captured even on early failure.
+write_result() {
+  local elapsed="${1:-0}"
+  cat > "${RESULT_JSON}" <<JSON
+{
+  "runtime": "${RUNTIME}",
+  "billing_mode": "${BILLING_MODE}",
+  "provider": "${PROVIDER}",
+  "model": "${MODEL}",
+  "workspace_id": "${WORKSPACE_ID}",
+  "register_status": "${REGISTER_STATUS}",
+  "completion_status": "${COMPLETION_STATUS}",
+  "teardown_status": "${TEARDOWN_STATUS}",
+  "elapsed_seconds": ${elapsed},
+  "exit_code": ${EXIT_CODE},
+  "ts": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+}
+JSON
+}
+
+# EXIT trap — ALWAYS run. Writes structured results, tears down
+# workspace if we have one, never lets the script exit without
+# emitting /tmp/cell-result.json.
+on_exit() {
+  local exit_code=$?
+  EXIT_CODE=${exit_code}
+  local now
+  now=$(date +%s)
+  local elapsed=0
+  if [ -n "${PROVISION_START_EPOCH:-}" ] && [ "${PROVISION_START_EPOCH}" -gt 0 ] 2>/dev/null; then
+    elapsed=$(( now - PROVISION_START_EPOCH ))
+  fi
+
+  # Assertion 4: teardown terminates workspace even on failure.
+  if [ -z "${KEEP_ORG}" ] && [ -n "${SLUG:-}" ]; then
+    if [ -n "${WORKSPACE_ID:-}" ] || [ -n "${SLUG:-}" ]; then
+      echo "::group::Teardown (trap)"
+      echo "DELETE ${CP_URL}/cp/admin/tenants/${SLUG}"
+      local teardown_http_code
+      teardown_http_code=$(curl -sS -o /dev/null -w '%{http_code}' \
+        -X DELETE \
+        -H "Authorization: Bearer ${ADMIN_TOKEN}" \
+        --max-time 60 \
+        "${CP_URL}/cp/admin/tenants/${SLUG}" || echo "000")
+      if [ "${teardown_http_code}" = "200" ] || [ "${teardown_http_code}" = "204" ] || [ "${teardown_http_code}" = "404" ]; then
+        TEARDOWN_STATUS="ok"
+        echo "Teardown OK (HTTP ${teardown_http_code})"
+      else
+        TEARDOWN_STATUS="leak_risk_http_${teardown_http_code}"
+        echo "::error::Teardown returned HTTP ${teardown_http_code} — orphan risk"
+        # Bump exit code to 6 if teardown is the failure source.
+        if [ "${EXIT_CODE}" -eq 0 ]; then
+          EXIT_CODE=6
+        fi
+      fi
+      echo "::endgroup::"
+    fi
+  else
+    TEARDOWN_STATUS="skipped_keep_org"
+  fi
+
+  write_result "${elapsed}"
+  echo "Structured results written to ${RESULT_JSON}"
+  cat "${RESULT_JSON}"
+  exit "${EXIT_CODE}"
+}
+trap on_exit EXIT
+trap 'echo "::error::Script aborted on signal"; exit 130' INT TERM
+
+# Start of the elapsed-time window reported in the results JSON.
+PROVISION_START_EPOCH=$(date +%s)
+
+# Assertion 1: Provision request accepted; workspace transitions to
+# booting/running. This block covers the request itself; a 200 or 202
+# counts as accepted, anything else exits 1.
+echo "::group::Assertion 1: Provision"
+echo "POST ${CP_URL}/cp/admin/orgs  slug=${SLUG}  runtime=${RUNTIME}  billing_mode=${BILLING_MODE}  provider=${PROVIDER}  model=${MODEL}"
+PROVISION_PAYLOAD=$(cat <<JSON
+{
+  "slug": "${SLUG}",
+  "runtime": "${RUNTIME}",
+  "billing_mode": "${BILLING_MODE}",
+  "provider": "${PROVIDER}",
+  "model": "${MODEL}",
+  "tier": "spot",
+  "tags": {
+    "cp455_minimal_cell": "1",
+    "run_id": "${RUN_ID_SUFFIX}"
+  }
+}
+JSON
+)
+# curl prints 000 via -w even when the transfer fails; on failure the
+# value is overwritten with a clean "000" instead of being appended to
+# (which would log "HTTP 000000").
+PROVISION_HTTP_CODE=$(curl -sS -o /tmp/provision-resp.json -w '%{http_code}' \
+  -X POST \
+  -H "Authorization: Bearer ${ADMIN_TOKEN}" \
+  -H "Content-Type: application/json" \
+  --max-time 30 \
+  -d "${PROVISION_PAYLOAD}" \
+  "${CP_URL}/cp/admin/orgs") || PROVISION_HTTP_CODE="000"
+echo "HTTP ${PROVISION_HTTP_CODE}"
+if [ "${PROVISION_HTTP_CODE}" != "202" ] && [ "${PROVISION_HTTP_CODE}" != "200" ]; then
+  echo "::error::Provision failed (HTTP ${PROVISION_HTTP_CODE})"
+  cat /tmp/provision-resp.json 2>/dev/null || true
+  # Close the log group before exiting so the teardown group opened by
+  # the EXIT trap is not nested inside this one.
+  echo "::endgroup::"
+  EXIT_CODE=1
+  exit "${EXIT_CODE}"
+fi
+echo "::endgroup::"
+
+# Wait for org to reach running + retrieve per-tenant token. Bounded
+# at PROVISION_TIMEOUT_SECS. We poll the admin token endpoint; once
+# the org is up, the endpoint returns 200 with the token, and the
+# workspace_id is in the same response or in a follow-up /orgs/:slug
+# call. Exits 3 when either value is still missing at the deadline.
+echo "::group::Wait for org to be ready (max ${PROVISION_TIMEOUT_SECS}s)"
+WAIT_START=$(date +%s)
+WAIT_DEADLINE=$(( WAIT_START + PROVISION_TIMEOUT_SECS ))
+TENANT_TOKEN=""
+while [ "$(date +%s)" -lt "${WAIT_DEADLINE}" ]; do
+  # On transport failure, normalise to a clean "000" (curl already prints
+  # 000 via -w, so echoing another one would concatenate).
+  TOKEN_HTTP_CODE=$(curl -sS -o /tmp/token-resp.json -w '%{http_code}' \
+    -H "Authorization: Bearer ${ADMIN_TOKEN}" \
+    --max-time 10 \
+    "${CP_URL}/cp/admin/orgs/${SLUG}/admin-token") || TOKEN_HTTP_CODE="000"
+  if [ "${TOKEN_HTTP_CODE}" = "200" ]; then
+    TENANT_TOKEN=$(jq -r '.admin_token // .token // empty' /tmp/token-resp.json 2>/dev/null || echo "")
+    if [ -n "${TENANT_TOKEN}" ]; then
+      WORKSPACE_ID=$(jq -r '.workspace_id // .default_workspace_id // empty' /tmp/token-resp.json 2>/dev/null || echo "")
+      if [ -z "${WORKSPACE_ID}" ]; then
+        # Fallback: fetch the org record by slug and read the id from it.
+        # --max-time is required here too: an unbounded request could hang
+        # past WAIT_DEADLINE and into the job timeout, which would kill
+        # the script before the teardown trap can run.
+        WORKSPACE_ID=$(curl -sS -H "Authorization: Bearer ${ADMIN_TOKEN}" \
+          --max-time 10 \
+          "${CP_URL}/cp/admin/orgs/${SLUG}" | jq -r '.workspace_id // .default_workspace_id // empty' 2>/dev/null || echo "")
+      fi
+      if [ -n "${WORKSPACE_ID}" ]; then
+        PROVISION_END_EPOCH=$(date +%s)
+        echo "Org ready in $(( PROVISION_END_EPOCH - WAIT_START ))s — workspace_id=${WORKSPACE_ID}"
+        break
+      fi
+    fi
+  fi
+  sleep 5
+done
+if [ -z "${TENANT_TOKEN}" ] || [ -z "${WORKSPACE_ID}" ]; then
+  echo "::error::Provision timed out (org never reached running within ${PROVISION_TIMEOUT_SECS}s)"
+  # Close the log group so the teardown group from the EXIT trap is not nested.
+  echo "::endgroup::"
+  EXIT_CODE=3
+  exit "${EXIT_CODE}"
+fi
+echo "::endgroup::"
+
+# Assertion 2: Controlplane receives /registry/register for that
+# workspace_id. The harness doesn't POST to /registry/register
+# directly — that's the workspace-server's own job on boot. We
+# verify the registration was received by polling the registry
+# endpoint (or by checking that a /workspaces/:id call returns
+# the expected fields).
+echo "::group::Assertion 2: /registry/register for workspace_id=${WORKSPACE_ID}"
+REGISTER_DEADLINE=$(( $(date +%s) + 60 ))
+while [ "$(date +%s)" -lt "${REGISTER_DEADLINE}" ]; do
+  REG_HTTP_CODE=$(curl -sS -o /tmp/reg-resp.json -w '%{http_code}' \
+    -H "Authorization: Bearer ${TENANT_TOKEN}" \
+    --max-time 10 \
+    "${CP_URL}/cp/registry/workspaces/${WORKSPACE_ID}" || echo "000")
+  if [ "${REG_HTTP_CODE}" = "200" ]; then
+    REGISTERED=$(jq -r '.registered // .workspace_id // empty' /tmp/reg-resp.json 2>/dev/null || echo "")
+    if [ -n "${REGISTERED}" ]; then
+      REGISTER_STATUS="ok"
+      echo "Registry confirms workspace_id=${WORKSPACE_ID} registered"
+      break
+    fi
+  fi
+  sleep 3
+done
+if [ "${REGISTER_STATUS}" != "ok" ]; then
+  echo "::error::Registry did not confirm registration within 60s"
+  cat /tmp/reg-resp.json 2>/dev/null || true
+  EXIT_CODE=4
+  exit "${EXIT_CODE}"
+fi
+echo "::endgroup::"
+
+# Assertion 3: JSON-RPC/completion route returns successful minimal
+# response. One minimal completion call — keep payload small.
+echo "::group::Assertion 3: JSON-RPC completion"
+COMPLETION_HTTP_CODE=$(curl -sS -o /tmp/completion-resp.json -w '%{http_code}' \
+  -X POST \
+  -H "Authorization: Bearer ${TENANT_TOKEN}" \
+  -H "Content-Type: application/json" \
+  --max-time 30 \
+  -d "$(cat <<JSON
+{
+  "jsonrpc": "2.0",
+  "id": 1,
+  "method": "completion",
+  "params": {
+    "workspace_id": "${WORKSPACE_ID}",
+    "model": "${MODEL}",
+    "messages": [{"role": "user", "content": "ping"}],
+    "max_tokens": 1
+  }
+}
+JSON
+)" \
+  "${CP_URL}/cp/rpc" || echo "000")
+echo "HTTP ${COMPLETION_HTTP_CODE}"
+if [ "${COMPLETION_HTTP_CODE}" != "200" ]; then
+  echo "::error::Completion failed (HTTP ${COMPLETION_HTTP_CODE})"
+  cat /tmp/completion-resp.json 2>/dev/null || true
+  EXIT_CODE=5
+  exit "${EXIT_CODE}"
+fi
+# Verify JSON-RPC 2.0 success envelope
+RPC_ERROR=$(jq -r '.error // empty' /tmp/completion-resp.json 2>/dev/null || echo "")
+if [ -n "${RPC_ERROR}" ]; then
+  echo "::error::Completion returned JSON-RPC error: ${RPC_ERROR}"
+  cat /tmp/completion-resp.json 2>/dev/null || true
+  EXIT_CODE=5
+  exit "${EXIT_CODE}"
+fi
+RPC_RESULT=$(jq -r '.result // empty' /tmp/completion-resp.json 2>/dev/null || echo "")
+if [ -z "${RPC_RESULT}" ] || [ "${RPC_RESULT}" = "null" ]; then
+  echo "::error::Completion response missing result field"
+  cat /tmp/completion-resp.json 2>/dev/null || true
+  EXIT_CODE=5
+  exit "${EXIT_CODE}"
+fi
+COMPLETION_STATUS="ok"
+echo "Completion OK"
+echo "::endgroup::"
+
+echo "All 4 assertions passed for ${SLUG} (workspace_id=${WORKSPACE_ID})"