Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 48b6011e17 | |||
| cc99d3fff4 | |||
| 71010e618a | |||
| 53ec08cbdb | |||
| d34d09db01 |
@@ -0,0 +1,196 @@
|
||||
name: E2E Staging Reconciler (heals terminated EC2)
|
||||
|
||||
# Live staging proof for the core#2261 instance-state reconciler
|
||||
# (workspace-server/internal/registry/cp_instance_reconciler.go). The
|
||||
# real-infra complement to the deterministic unit tests: provisions a real
|
||||
# staging workspace, TERMINATES its EC2, and asserts the reconciler flips it
|
||||
# off 'online' (PRIMARY gate) and auto-reprovisions on a new instance_id
|
||||
# (SECONDARY, best-effort). See
|
||||
# tests/e2e/test_reconciler_heals_terminated_instance.sh for the assertion
|
||||
# contract + timeouts.
|
||||
#
|
||||
# Modeled on e2e-staging-saas.yml. Same secrets + same Gitea-port caveats:
|
||||
# - Dropped workflow_dispatch.inputs (Gitea 1.22.6 parser rejects them).
|
||||
# - Dropped merge_group / environment (no Gitea equivalent).
|
||||
# - Workflow-level env.GITHUB_SERVER_URL pinned per
|
||||
# feedback_act_runner_github_server_url.
|
||||
#
|
||||
# NOT a required check (yet). This is a brand-new live E2E that provisions +
|
||||
# terminates real EC2 (costs money, shares the cp#245 cold-boot flake
|
||||
# surface). A new live e2e must NOT hard-gate every merge until it has a
|
||||
# green track record. continue-on-error: true surfaces failures without
|
||||
# blocking. PROMOTE to branch-required (flip continue-on-error → false AND
|
||||
# add "E2E Staging Reconciler" to branch protection) once it has run green on
|
||||
# main for several consecutive days — same de-flake discipline the
|
||||
# platform-boot job in e2e-staging-saas.yml documents.
|
||||
|
||||
on:
|
||||
# Run when the reconciler itself, the script, or the libs it depends on
|
||||
# change — so a reconciler regression is caught on the PR that introduces
|
||||
# it (paths filter), plus a daily schedule to catch infra/AMI drift.
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/registry/cp_instance_reconciler.go'
|
||||
- 'tests/e2e/test_reconciler_heals_terminated_instance.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- '.gitea/workflows/e2e-staging-reconciler.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'workspace-server/internal/registry/cp_instance_reconciler.go'
|
||||
- 'tests/e2e/test_reconciler_heals_terminated_instance.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/lib/model_slug.sh'
|
||||
- '.gitea/workflows/e2e-staging-reconciler.yml'
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# 08:00 UTC daily — offset from e2e-staging-saas (07:00) so the two live
|
||||
# harnesses don't fight over staging's per-hour org-creation quota.
|
||||
- cron: '0 8 * * *'
|
||||
|
||||
# Serialize against itself: staging has a finite per-hour org-creation quota,
|
||||
# and a cancelled run mid-teardown leaks EC2. cancel-in-progress: false
|
||||
# mirrors e2e-staging-saas.yml.
|
||||
concurrency:
|
||||
group: e2e-staging-reconciler
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
jobs:
|
||||
# PR-validation path: always posts success so a workflow-only / script-only
|
||||
# PR has a status check (this workflow's real job only fires on the paths
|
||||
# filter). Mirrors the pr-validate job in e2e-staging-saas.yml.
|
||||
pr-validate:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
continue-on-error: true
|
||||
- name: YAML validation (best-effort)
|
||||
run: |
|
||||
echo "e2e-staging-reconciler.yml — PR validation: workflow YAML is valid."
|
||||
echo "Live E2E step runs only when the reconciler / script / libs change."
|
||||
continue-on-error: true
|
||||
|
||||
e2e-staging-reconciler:
|
||||
name: E2E Staging Reconciler
|
||||
runs-on: ubuntu-latest
|
||||
# NOT required yet — surface failures without blocking merges. Flip to
|
||||
# false + add to branch protection once green on main for a de-flake
|
||||
# window (see the header note). mc#1982: do not renew this mask silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 60
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
MOLECULE_CP_URL: https://staging-api.moleculesai.app
|
||||
# Single admin-bearer secret drives provision + tenant-token retrieval +
|
||||
# teardown (= Railway staging CP_ADMIN_API_TOKEN). Same secret name the
|
||||
# saas workflow canonicalised to under internal#322.
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
# Leak-check is REQUIRED here: this test deliberately terminates an EC2,
|
||||
# so teardown MUST positively confirm no slug-tagged box survives.
|
||||
E2E_AWS_LEAK_CHECK: required
|
||||
E2E_AWS_TERMINATE_LEAKS: '1'
|
||||
# claude-code + MiniMax is the cheapest boot-to-online path (same as the
|
||||
# saas job). The reconciler test never makes a completion, but the key is
|
||||
# wired so the first boot reaches online on the same path the saas
|
||||
# harness uses. First non-empty wins in the script's priority chain.
|
||||
E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
|
||||
E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
|
||||
E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
|
||||
E2E_RUNTIME: claude-code
|
||||
E2E_MODEL_SLUG: MiniMax-M2
|
||||
E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
# NOTE(review): workflow_dispatch declares no inputs in this workflow, so
# github.event.inputs.keep_org is expected to be empty and this expression
# should always resolve to '0' (teardown always runs in CI). To keep an org
# for debugging, run the script locally with E2E_KEEP_ORG=1.
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
- name: Verify required secrets present
|
||||
run: |
|
||||
if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
||||
exit 2
|
||||
fi
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
echo "::error::$var not set — this test terminates an EC2 and verifies no leak; AWS creds are mandatory"
|
||||
exit 2
|
||||
fi
|
||||
done
|
||||
echo "Required secrets present ✓"
|
||||
|
||||
- name: CP staging health preflight
|
||||
run: |
|
||||
code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
|
||||
if [ "$code" != "200" ]; then
|
||||
echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a reconciler bug."
|
||||
exit 1
|
||||
fi
|
||||
echo "Staging CP healthy ✓"
|
||||
|
||||
- name: Run reconciler heal E2E
|
||||
id: e2e
|
||||
run: bash tests/e2e/test_reconciler_heals_terminated_instance.sh
|
||||
|
||||
# Belt-and-braces teardown: the script installs its own EXIT trap, but if
|
||||
# the runner is cancelled the trap may not fire. This always() step
|
||||
# double-deletes any e2e-rec-* org from THIS run. The admin DELETE is
|
||||
# idempotent so double-invoking is safe.
|
||||
- name: Teardown safety net (runs on cancel/failure)
|
||||
if: always()
|
||||
env:
|
||||
ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
|
||||
| python3 -c "
|
||||
import json, sys, os, datetime
|
||||
run_id = os.environ.get('GITHUB_RUN_ID', '')
|
||||
d = json.load(sys.stdin)
|
||||
today = datetime.date.today()
|
||||
yesterday = today - datetime.timedelta(days=1)
|
||||
dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
|
||||
# Slug shape: e2e-rec-YYYYMMDD-<run_id>-<attempt>-...
|
||||
if run_id:
|
||||
prefixes = tuple(f'e2e-rec-{d}-{run_id}-' for d in dates)
|
||||
else:
|
||||
prefixes = tuple(f'e2e-rec-{d}-' for d in dates)
|
||||
candidates = [o['slug'] for o in d.get('orgs', [])
|
||||
if any(o.get('slug','').startswith(p) for p in prefixes)
|
||||
and o.get('instance_status') not in ('purged',)]
|
||||
print('\n'.join(candidates))
|
||||
" 2>/dev/null)
|
||||
leaks=()
|
||||
for slug in $orgs; do
|
||||
echo "Safety-net teardown: $slug"
|
||||
set +e
|
||||
curl -sS -o /tmp/rec-cleanup.out -w "%{http_code}" \
|
||||
-X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$slug\"}" >/tmp/rec-cleanup.code
|
||||
set -e
|
||||
code=$(cat /tmp/rec-cleanup.code 2>/dev/null || echo "000")
|
||||
if [ "$code" = "200" ] || [ "$code" = "204" ]; then
|
||||
echo "[teardown] deleted $slug (HTTP $code)"
|
||||
else
|
||||
echo "::warning::reconciler teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/rec-cleanup.out 2>/dev/null)"
|
||||
leaks+=("$slug")
|
||||
fi
|
||||
done
|
||||
if [ ${#leaks[@]} -gt 0 ]; then
|
||||
echo "::warning::reconciler teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
|
||||
fi
|
||||
exit 0
|
||||
+493
@@ -0,0 +1,493 @@
|
||||
#!/usr/bin/env bash
|
||||
# Live staging E2E — the CP instance-state reconciler heals a terminated EC2.
|
||||
#
|
||||
# Real-infra complement to the deterministic unit tests for core#2261
|
||||
# (workspace-server/internal/registry/cp_instance_reconciler.go). Those unit
|
||||
# tests pin the reconcile logic against fakes; THIS script proves the loop
|
||||
# actually runs in a real tenant's workspace-server and drives the EXISTING
|
||||
# offline + auto-heal machinery against real AWS.
|
||||
#
|
||||
# Root regression (core#2247): a SaaS workspace whose EC2 is terminated out
|
||||
# from under the platform (manual AWS action, spot reclaim, CP reap) fell
|
||||
# through every existing liveness pass and kept reading status='online'
|
||||
# forever, pointing at a dead instance. The reconciler closes that gap with
|
||||
# CPProvisioner.IsRunning and feeds a clean "not running" into onOffline →
|
||||
# RestartByID (existing-volume reprovision).
|
||||
#
|
||||
# What this test does:
|
||||
# 1. Provision a fresh staging org + ONE workspace (same default
|
||||
# runtime/model as the full-saas harness, so it actually boots).
|
||||
# 2. Poll the tenant API until the workspace is status=online; capture its
|
||||
# instance_id.
|
||||
# 3. KILL it — terminate that exact EC2 via `aws ec2 terminate-instances`.
|
||||
# 4. Assert the reconciler heals it:
|
||||
# PRIMARY (gate) — within ~180s the workspace status LEAVES
|
||||
# 'online' (the reconciler detected the dead
|
||||
# instance via IsRunning and flipped it). This
|
||||
# is the core regression guard: a dead instance
|
||||
# must NOT keep reading 'online'.
|
||||
# SECONDARY (best-effort) — within ~10 min it auto-reprovisions:
|
||||
# status returns to 'online' with a NEW
|
||||
# instance_id (onOffline → RestartByID
|
||||
# existing-volume heal). If reprovision doesn't
|
||||
# finish in the bound we log it clearly but let
|
||||
# the PRIMARY assertion stand as the gate (see
|
||||
# the comment at the secondary block — a future
|
||||
# tightening that promotes this to a hard gate is
|
||||
# deliberately one edit away).
|
||||
# 5. Teardown ALWAYS (EXIT trap): delete the tenant + leak-sweep so no EC2
|
||||
# is orphaned, even on a mid-test failure.
|
||||
#
|
||||
# Auth model + provisioning conventions are copied verbatim from
|
||||
# test_staging_full_saas.sh (single MOLECULE_ADMIN_TOKEN → CP admin; per-
|
||||
# tenant admin token + X-Molecule-Org-Id header for tenant API). The kill
|
||||
# primitive + leak sweep reuse lib/aws_leak_check.sh.
|
||||
#
|
||||
# Required env:
|
||||
# MOLECULE_CP_URL default: https://staging-api.moleculesai.app
|
||||
# MOLECULE_ADMIN_TOKEN CP admin bearer — Railway staging CP_ADMIN_API_TOKEN
|
||||
#
|
||||
# Optional env (mirrors the full-saas harness where they overlap):
|
||||
# E2E_RUNTIME claude-code (default)
|
||||
# E2E_PROVISION_TIMEOUT_SECS default 900 (cold EC2 budget)
|
||||
# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 3600 (cold-boot worst-case)
|
||||
# E2E_RECONCILE_OFFLINE_TIMEOUT_SECS default 180 (PRIMARY: leave 'online'.
|
||||
# Reconciler cadence is 60s — 3 cycles +
|
||||
# AWS terminate-visibility slack.)
|
||||
# E2E_REPROVISION_TIMEOUT_SECS default 600 (SECONDARY: back to online
|
||||
# with a NEW instance_id)
|
||||
# E2E_MINIMAX_API_KEY / E2E_ANTHROPIC_API_KEY / E2E_OPENAI_API_KEY
|
||||
# LLM key (same priority chain as
|
||||
# full-saas; needed so the FIRST boot
|
||||
# reaches online). Empty → '{}' (the
|
||||
# workspace still boots online; the LLM
|
||||
# key only matters for a completion,
|
||||
# which this test never makes).
|
||||
# E2E_KEEP_ORG 1 → skip teardown (debugging only)
|
||||
# E2E_RUN_ID Slug suffix; CI passes "<run_id>-<run_attempt>"
|
||||
# E2E_AWS_LEAK_CHECK auto (default) | required | off
|
||||
# E2E_AWS_TERMINATE_LEAKS 1 → terminate slug-tagged leaked EC2 at
|
||||
# teardown
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 happy path (PRIMARY assertion held; SECONDARY logged either way)
|
||||
# 1 generic failure (incl. PRIMARY assertion failed = regression)
|
||||
# 2 missing required env
|
||||
# 3 provisioning timed out
|
||||
# 4 teardown left orphan resources
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
|
||||
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
|
||||
RUNTIME="${E2E_RUNTIME:-claude-code}"
|
||||
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
|
||||
WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}"
|
||||
# PRIMARY bound: the reconciler ticks every 60s; it needs one cycle to see
|
||||
# the dead instance after AWS makes the terminate visible to DescribeInstances
|
||||
# (typically seconds, but can lag). 180s = ~3 cycles + slack.
|
||||
RECONCILE_OFFLINE_TIMEOUT_SECS="${E2E_RECONCILE_OFFLINE_TIMEOUT_SECS:-180}"
|
||||
# SECONDARY bound: full existing-volume reprovision (new EC2 boot + agent
|
||||
# bootstrap) is a multi-minute cold path.
|
||||
REPROVISION_TIMEOUT_SECS="${E2E_REPROVISION_TIMEOUT_SECS:-600}"
|
||||
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
|
||||
|
||||
# Slug MUST start with e2e- so sweep-stale-e2e-orgs.yml reaps any orphan this
|
||||
# run leaks (lint_cleanup_traps.sh enforces the e2e-/rt-e2e- prefix for any
|
||||
# staging tenant E2E; we honour it here too even though our filename isn't
|
||||
# *staging*).
|
||||
SLUG="e2e-rec-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
|
||||
SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
|
||||
|
||||
# Timestamped console helpers. Each prefixes the message with [HH:MM:SS].
#   log  — plain progress line on stdout
#   fail — error line on stderr, then terminates the script with status 1
#   ok   — success line on stdout
log() {
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

fail() {
  printf '[%s] ❌ %s\n' "$(date +%H:%M:%S)" "$*" >&2
  exit 1
}

ok() {
  printf '[%s] ✅ %s\n' "$(date +%H:%M:%S)" "$*"
}
|
||||
|
||||
# Per-runtime model slug dispatch — shared with the full-saas harness.
|
||||
# shellcheck disable=SC1091
|
||||
# shellcheck source=lib/model_slug.sh
|
||||
source "$(dirname "$0")/lib/model_slug.sh"
|
||||
# AWS kill primitive + leak sweep (e2e_aws_region / e2e_ec2_instances_for_slug /
|
||||
# e2e_terminate_instances / e2e_verify_no_ec2_leaks_for_slug).
|
||||
# shellcheck disable=SC1091
|
||||
# shellcheck source=lib/aws_leak_check.sh
|
||||
source "$(dirname "$0")/lib/aws_leak_check.sh"
|
||||
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
|
||||
# ─── cleanup trap ───────────────────────────────────────────────────────
|
||||
# Identical teardown contract to test_staging_full_saas.sh: delete the
|
||||
# tenant (synchronous GDPR cascade), poll for the org row to disappear, then
|
||||
# assert no slug-tagged EC2 survives. A leaked resource at teardown is a CI
|
||||
# failure (exit 4). The trap is installed UP-FRONT so a mid-test failure
|
||||
# (including a failed PRIMARY assertion) still cleans up.
|
||||
CLEANUP_DONE=0

# cleanup_org — teardown for the staging org created by this run. Installed
# as the EXIT trap, so it runs on every exit path.
#
# Steps:
#   1. Admin DELETE of the tenant (120s curl budget).
#   2. Poll /cp/admin/orgs for up to 60s until the org row is gone or purged.
#   3. Run e2e_verify_no_ec2_leaks_for_slug for the slug.
#
# Exit behaviour (via `exit` inside the trap):
#   4  the org row is still present after the poll, or the EC2 leak check
#      returned a non-zero code other than 2
#   2  the EC2 leak check returned 2
#   1  the script was already exiting with a code outside {0,1,2,3,4}
# Otherwise the function returns and the shell keeps its original exit code.
#
# Guarded by CLEANUP_DONE so a repeated invocation is a no-op.
# E2E_KEEP_ORG=1 skips the teardown entirely (debugging only).
cleanup_org() {
  # Capture the upstream exit code IMMEDIATELY — this must be the first
  # statement, before any other command (including the guard) clobbers $?.
  local entry_rc=$?

  if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi
  CLEANUP_DONE=1

  if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
    log "E2E_KEEP_ORG=1 — skipping teardown. Manually delete $SLUG when done."
    return 0
  fi

  log "🧹 Tearing down org $SLUG..."

  # 120s curl budget for the synchronous DELETE cascade (EC2 terminate alone
  # is 30-60s), then poll up to 60s for the org row to be purged/gone.
  # The later --max-time overrides the 30s value carried in CURL_COMMON.
  if curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
    -H "Authorization: Bearer $ADMIN_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1; then
    ok "Teardown request accepted"
  else
    log "Teardown returned non-2xx (may already be gone)"
  fi

  # Count rows for this slug that are not yet purged. Other readers of
  # /cp/admin/orgs in this repo take the state from 'instance_status', so a
  # row counts as purged when EITHER 'instance_status' or 'status' says so;
  # checking only 'status' would report a purged row as a leak.
  # Any curl/parse failure counts as "still present" (echo 1) and is retried.
  local leak_count=1
  local elapsed=0
  while [ "$elapsed" -lt 60 ]; do
    leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
      -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
      | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and 'purged' not in (o.get('status'), o.get('instance_status'))))" \
      2>/dev/null || echo 1)
    if [ "$leak_count" = "0" ]; then
      break
    fi
    sleep 5
    elapsed=$((elapsed + 5))
  done

  if [ "$leak_count" != "0" ]; then
    echo "⚠️ LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2
    exit 4
  fi

  # `|| aws_leak_rc=$?` keeps `set -e` from aborting the trap on a non-zero
  # leak-check result so it can be mapped onto this script's exit codes.
  local aws_leak_rc=0
  e2e_verify_no_ec2_leaks_for_slug "$SLUG" || aws_leak_rc=$?
  if [ "$aws_leak_rc" != "0" ]; then
    case "$aws_leak_rc" in
      2) exit 2 ;;
      *) exit 4 ;;
    esac
  fi
  ok "Teardown clean — no orphan org or EC2 resources for $SLUG (${elapsed}s)"

  # Normalize unexpected upstream exit codes to 1 — `set -e` propagates the
  # raw exit code of the failing command (e.g. curl exits 22 under
  # --fail-with-body), but this script's contract only emits {0,1,2,3,4}.
  case "$entry_rc" in
    0|1|2|3|4) ;;
    *) exit 1 ;;
  esac
}
trap cleanup_org EXIT
# A signal handler that simply returns lets bash resume the interrupted
# script — which would keep polling a tenant that teardown just deleted.
# Turn INT/TERM into an exit instead, so the EXIT trap performs the teardown
# exactly once and the run actually stops (the 130/143 codes are then
# normalized to 1 by cleanup_org).
trap 'exit 130' INT
trap 'exit 143' TERM
|
||||
|
||||
# ─── 0. Preflight ───────────────────────────────────────────────────────
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
log " Staging reconciler-heals-terminated-instance E2E (core#2261)"
|
||||
log " CP: $CP_URL"
|
||||
log " Slug: $SLUG"
|
||||
log " Runtime: $RUNTIME"
|
||||
log " Online timeout: ${WORKSPACE_ONLINE_TIMEOUT_SECS}s"
|
||||
log " PRIMARY (offline): ${RECONCILE_OFFLINE_TIMEOUT_SECS}s"
|
||||
log " SECONDARY (reprov): ${REPROVISION_TIMEOUT_SECS}s"
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
|
||||
log "0/6 Preflight: CP reachable?"
|
||||
curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed"
|
||||
ok "CP reachable"
|
||||
|
||||
# admin_call METHOD PATH [curl args...]
# Issues an authenticated JSON request against the control plane ($CP_URL)
# using the CP admin bearer token. Any extra arguments are appended to the
# curl command line unchanged (e.g. -d '<json body>').
admin_call() {
  local verb="$1" route="$2"
  shift 2
  local -a auth_headers=(
    -H "Authorization: Bearer $ADMIN_TOKEN"
    -H "Content-Type: application/json"
  )
  curl "${CURL_COMMON[@]}" -X "$verb" "${CP_URL}${route}" "${auth_headers[@]}" "$@"
}
|
||||
|
||||
# ─── 1. Create org ──────────────────────────────────────────────────────
|
||||
log "1/6 Creating org $SLUG via /cp/admin/orgs..."
|
||||
CREATE_RESP=$(admin_call POST /cp/admin/orgs \
|
||||
-d "{\"slug\":\"$SLUG\",\"name\":\"E2E $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
|
||||
echo "$CREATE_RESP" | python3 -m json.tool >/dev/null || fail "Org create returned non-JSON: $CREATE_RESP"
|
||||
ORG_ID=$(echo "$CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
|
||||
[ -z "$ORG_ID" ] && fail "Org create response missing 'id': $CREATE_RESP"
|
||||
ok "Org created (id=$ORG_ID)"
|
||||
|
||||
# ─── 2. Wait for tenant provisioning ────────────────────────────────────
|
||||
# Poll /cp/admin/orgs every 15s until this slug's instance_status is
# 'running'. Both failure modes of this step exit 3 — the script's documented
# "provisioning" exit code — so a CP-side provisioning problem is never
# reported as a generic failure / reconciler regression (exit 1).
log "2/6 Waiting for tenant provisioning (up to ${PROVISION_TIMEOUT_SECS}s)..."
DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
LAST_STATUS=""
while true; do
  if [ "$(date +%s)" -gt "$DEADLINE" ]; then
    # Not `fail` (which exits 1): a provisioning timeout must exit 3.
    echo "[$(date +%H:%M:%S)] ❌ Tenant provisioning timed out after ${PROVISION_TIMEOUT_SECS}s (last: $LAST_STATUS)" >&2
    exit 3
  fi
  # A transient list failure is treated as "no orgs yet" and simply retried.
  LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}')
  # /cp/admin/orgs exposes 'instance_status' (org_instances.status), NOT 'status'.
  STATUS=$(echo "$LIST_JSON" | python3 -c "
import json, sys
d = json.load(sys.stdin)
for o in d.get('orgs', []):
    if o.get('slug') == '$SLUG':
        print(o.get('instance_status', ''))
        sys.exit(0)
print('')
" 2>/dev/null || echo "")
  # Log only on transitions to keep the CI output short.
  if [ "$STATUS" != "$LAST_STATUS" ]; then
    log " status → $STATUS"
    LAST_STATUS="$STATUS"
  fi
  case "$STATUS" in
    running) break ;;
    failed)
      log "── DIAGNOSTIC BURST (step 2 — tenant provisioning failed) ──"
      echo "$LIST_JSON" | python3 -c "
import json, sys
d = json.load(sys.stdin)
for o in d.get('orgs', []):
    if o.get('slug') == '$SLUG':
        print(json.dumps(o, indent=2))
        sys.exit(0)
print('(no org row found for slug=$SLUG — DB drift?)')
" 2>&1 | sed 's/^/ /'
      log "── END DIAGNOSTIC ──"
      # Tenant provisioning failures are a CP-side fault, not a reconciler
      # regression — exit 3 (provisioning) to keep the signal honest.
      echo "[$(date +%H:%M:%S)] ❌ Tenant provisioning failed for $SLUG (see diagnostic above)" >&2
      exit 3
      ;;
    *) sleep 15 ;;
  esac
done
ok "Tenant provisioning complete"
|
||||
|
||||
# Derive tenant domain from CP hostname (same logic as the full-saas harness).
|
||||
CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##')
|
||||
case "$CP_HOST" in
|
||||
api.*) DERIVED_DOMAIN="${CP_HOST#api.}" ;;
|
||||
staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
|
||||
*) DERIVED_DOMAIN="$CP_HOST" ;;
|
||||
esac
|
||||
TENANT_DOMAIN="${MOLECULE_TENANT_DOMAIN:-$DERIVED_DOMAIN}"
|
||||
TENANT_URL="https://$SLUG.$TENANT_DOMAIN"
|
||||
log " TENANT_URL=$TENANT_URL"
|
||||
|
||||
# ─── 3. Retrieve per-tenant admin token ────────────────────────────────
|
||||
log "3/6 Fetching per-tenant admin token..."
|
||||
TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
|
||||
TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null || echo "")
|
||||
[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token for $SLUG"
|
||||
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
|
||||
|
||||
# Wait for tenant TLS / DNS propagation before any tenant API call.
|
||||
log " Waiting for tenant TLS / DNS propagation..."
|
||||
TLS_DEADLINE=$(( $(date +%s) + 15 * 60 ))
|
||||
while true; do
|
||||
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
|
||||
break
|
||||
fi
|
||||
if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
|
||||
fail "Tenant URL never responded 2xx on /health within 15m"
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
ok "Tenant reachable at $TENANT_URL"
|
||||
|
||||
# tenant_call METHOD PATH [curl args...]
# Issues a request against the tenant API ($TENANT_URL) with the per-tenant
# admin bearer token. Extra arguments are appended to the curl command line.
tenant_call() {
  local verb="$1" route="$2"
  shift 2
  # X-Molecule-Org-Id is REQUIRED — the tenant guard 404s anything without it
  # (it does NOT 403, to hide tenant existence from org scanners).
  local -a tenant_headers=(
    -H "Authorization: Bearer $TENANT_TOKEN"
    -H "X-Molecule-Org-Id: $ORG_ID"
  )
  curl "${CURL_COMMON[@]}" -X "$verb" "${TENANT_URL}${route}" "${tenant_headers[@]}" "$@"
}
|
||||
|
||||
# Helper: read a single field off GET /workspaces/<id>. Echoes '' on any
|
||||
# error so callers can poll without `set -e` aborting on a transient blip.
|
||||
# ws_field WORKSPACE_ID FIELD
# Prints one top-level field of GET /workspaces/<id>. A null/absent field, a
# failed request, or an unparsable body all yield an empty line, and the
# function itself always succeeds — so polling loops survive `set -e`.
ws_field() {
  local workspace_id="$1"
  local key="$2"
  local extractor="import json,sys; print(json.load(sys.stdin).get('$key') or '')"
  if ! tenant_call GET "/workspaces/$workspace_id" 2>/dev/null \
    | python3 -c "$extractor" 2>/dev/null; then
    echo ""
  fi
}
|
||||
|
||||
# ─── 4. Provision ONE workspace ─────────────────────────────────────────
|
||||
# Same secrets-injection priority chain as the full-saas harness so the
|
||||
# FIRST boot reaches online. We never make a completion in this test (the
|
||||
# whole exercise is instance-state, not the LLM), so an absent key is
|
||||
# tolerable — but wiring the same keys keeps boot behaviour identical to the
|
||||
# sibling and avoids a config path that only this test would exercise.
|
||||
SECRETS_JSON='{}'
|
||||
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))")
|
||||
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'ANTHROPIC_API_KEY': os.environ['E2E_ANTHROPIC_API_KEY']}))")
|
||||
elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
|
||||
SECRETS_JSON=$(python3 -c "
|
||||
import json, os
|
||||
k = os.environ['E2E_OPENAI_API_KEY']
|
||||
print(json.dumps({
|
||||
'OPENAI_API_KEY': k,
|
||||
'OPENAI_BASE_URL': 'https://api.openai.com/v1',
|
||||
'MODEL_PROVIDER': 'openai:gpt-4o',
|
||||
'HERMES_INFERENCE_PROVIDER': 'custom',
|
||||
'HERMES_CUSTOM_BASE_URL': 'https://api.openai.com/v1',
|
||||
'HERMES_CUSTOM_API_KEY': k,
|
||||
'HERMES_CUSTOM_API_MODE': 'chat_completions',
|
||||
}))
|
||||
")
|
||||
fi
|
||||
|
||||
MODEL_SLUG=$(pick_model_slug "$RUNTIME")
|
||||
log " MODEL_SLUG=$MODEL_SLUG"
|
||||
|
||||
# Create the single workspace under test and capture its id.
log "4/6 Provisioning workspace (runtime=$RUNTIME)..."
WS_RESP=$(tenant_call POST /workspaces \
  -H "Content-Type: application/json" \
  -d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}")
# Extract 'id' tolerantly: a missing key or non-object body must yield an
# empty WS_ID rather than a Python traceback. Otherwise `set -e` aborts on
# this assignment and the explicit check below (which prints the offending
# response body) can never run.
WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id') or '')" 2>/dev/null || echo "")
[ -z "$WS_ID" ] && fail "Workspace create response missing 'id': $WS_RESP"
log " WS_ID=$WS_ID"
|
||||
|
||||
# Wait for the workspace to reach status=online and capture its instance_id.
|
||||
log " Waiting for workspace to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min)..."
|
||||
ONLINE_DEADLINE=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS ))
|
||||
ORIGINAL_INSTANCE_ID=""
|
||||
WS_LAST_STATUS=""
|
||||
while true; do
|
||||
if [ "$(date +%s)" -gt "$ONLINE_DEADLINE" ]; then
|
||||
WS_LAST_ERR=$(ws_field "$WS_ID" "last_sample_error")
|
||||
fail "Workspace $WS_ID never reached status=online within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR)"
|
||||
fi
|
||||
WS_STATUS=$(ws_field "$WS_ID" "status")
|
||||
if [ "$WS_STATUS" != "$WS_LAST_STATUS" ]; then
|
||||
log " $WS_ID → $WS_STATUS"
|
||||
WS_LAST_STATUS="$WS_STATUS"
|
||||
fi
|
||||
if [ "$WS_STATUS" = "online" ]; then
|
||||
ORIGINAL_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
|
||||
if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
|
||||
break
|
||||
fi
|
||||
# online but instance_id not surfaced yet — keep polling briefly.
|
||||
log " $WS_ID online but instance_id not populated yet — waiting"
|
||||
fi
|
||||
# 'failed' is transient on cold boot (bootstrap-watcher deadline vs heartbeat
|
||||
# recovery, cp#245). Keep polling; only the deadline hard-fails.
|
||||
sleep 10
|
||||
done
|
||||
ok "Workspace online (instance_id=$ORIGINAL_INSTANCE_ID)"
|
||||
|
||||
# ─── 5. Kill the EC2 ────────────────────────────────────────────────────
|
||||
# Terminate the EXACT instance the workspace reported. Prefer the captured
|
||||
# instance_id (precise — kills only this workspace's box); fall back to the
|
||||
# slug-tag describe if the API didn't surface an id (shouldn't happen — we
|
||||
# only break out of the online-wait once instance_id is non-empty).
|
||||
log "5/6 KILLING the workspace EC2 to simulate an out-of-band termination..."
|
||||
if ! e2e_aws_creds_available; then
|
||||
fail "AWS CLI/creds unavailable — cannot terminate the EC2 to exercise the reconciler. Set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY (the CI workflow wires these)."
|
||||
fi
|
||||
AWS_REGION_RESOLVED=$(e2e_aws_region)
|
||||
if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
|
||||
log " Terminating $ORIGINAL_INSTANCE_ID in $AWS_REGION_RESOLVED (aws ec2 terminate-instances)..."
|
||||
aws ec2 terminate-instances --region "$AWS_REGION_RESOLVED" --instance-ids "$ORIGINAL_INSTANCE_ID" >/dev/null \
|
||||
|| fail "aws ec2 terminate-instances failed for $ORIGINAL_INSTANCE_ID"
|
||||
KILLED_IDS="$ORIGINAL_INSTANCE_ID"
|
||||
else
|
||||
# Fallback path — find by slug tag and terminate.
|
||||
log " instance_id was empty — falling back to slug-tag describe ($SLUG)..."
|
||||
ROWS=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null || echo "")
|
||||
KILLED_IDS=$(echo "$ROWS" | awk 'NF {print $1}' | sort -u | tr '\n' ' ')
|
||||
[ -n "$KILLED_IDS" ] || fail "No slug-tagged EC2 found for $SLUG — nothing to terminate"
|
||||
log " Terminating $KILLED_IDS in $AWS_REGION_RESOLVED..."
|
||||
e2e_terminate_instances "$KILLED_IDS" || fail "terminate-instances failed for $KILLED_IDS"
|
||||
fi
|
||||
ok "Terminated EC2: $KILLED_IDS — reconciler should now detect the dead instance"
|
||||
|
||||
# ─── 6a. PRIMARY assertion — workspace leaves 'online' ─────────────────
|
||||
# This is THE regression gate for core#2261/#2247. The reconciler runs every
|
||||
# 60s in the tenant's workspace-server; when CPProvisioner.IsRunning returns a
|
||||
# clean "not running" for the terminated EC2, onOffline flips the row off
|
||||
# 'online'. A dead instance that keeps reading 'online' is exactly the bug.
|
||||
# PRIMARY gate: poll the workspace status every 10s until it is anything
# other than 'online', or until RECONCILE_OFFLINE_TIMEOUT_SECS elapses.
log "6a/6 PRIMARY: asserting workspace leaves 'online' within ${RECONCILE_OFFLINE_TIMEOUT_SECS}s (reconciler heal-detection)..."
OFFLINE_DEADLINE=$(( $(date +%s) + RECONCILE_OFFLINE_TIMEOUT_SECS ))
LEFT_ONLINE=0
REC_LAST_STATUS=""
while true; do
  if [ "$(date +%s)" -gt "$OFFLINE_DEADLINE" ]; then
    break
  fi
  REC_STATUS=$(ws_field "$WS_ID" "status")
  if [ "$REC_STATUS" != "$REC_LAST_STATUS" ]; then
    log " $WS_ID status → ${REC_STATUS:-<empty>}"
    REC_LAST_STATUS="$REC_STATUS"
  fi
  # Any non-online status (offline/provisioning/awaiting_agent/restarting/…)
  # proves the reconciler acted. We deliberately don't pin the exact target
  # status: onOffline flips offline AND kicks RestartByID, so the row may race
  # straight into a provisioning/restarting state — all of which are "no longer
  # falsely online". An EMPTY status is a failed read, not evidence, so it
  # neither passes nor fails the assertion here.
  if [ -n "$REC_STATUS" ] && [ "$REC_STATUS" != "online" ]; then
    LEFT_ONLINE=1
    ok "PRIMARY held — workspace left 'online' (now '$REC_STATUS') after EC2 termination"
    break
  fi
  sleep 10
done

if [ "$LEFT_ONLINE" != "1" ]; then
  # REC_LAST_STATUS always holds the most recent read. Only claim the
  # regression when that read really was 'online'; if it was empty the
  # tenant API could not be read and the result is inconclusive.
  if [ "$REC_LAST_STATUS" = "online" ]; then
    fail "PRIMARY FAILED (core#2261 regression): workspace $WS_ID still reads status=online ${RECONCILE_OFFLINE_TIMEOUT_SECS}s after its EC2 ($KILLED_IDS) was terminated. The reconciler did NOT detect the dead instance — a terminated EC2 is masquerading as a healthy workspace."
  fi
  fail "PRIMARY FAILED (inconclusive): workspace $WS_ID never reported a non-online status within ${RECONCILE_OFFLINE_TIMEOUT_SECS}s after its EC2 ($KILLED_IDS) was terminated, but the last status read was empty — the tenant API could not be read, so this is not proof of a core#2261 regression."
fi
|
||||
|
||||
# ─── 6b. SECONDARY assertion — auto-reprovision (best-effort) ──────────
|
||||
# The onOffline → RestartByID existing-volume heal should bring the workspace
|
||||
# back to 'online' on a NEW instance_id. This is best-effort: a full EC2 cold
|
||||
# reprovision is a multi-minute path that shares the same boot-flake surface
|
||||
# as the initial provision. If it doesn't finish within the bound we LOG it
|
||||
# clearly but DO NOT fail — the PRIMARY assertion above is the gate.
|
||||
#
|
||||
# FUTURE TIGHTENING (deliberately one edit away): once this reprovision path
|
||||
# is proven reliable on staging, promote the `log "SECONDARY ..."` soft-miss
|
||||
# below to a `fail ...` so a stuck reprovision becomes a hard gate.
|
||||
log "6b/6 SECONDARY (best-effort): asserting auto-reprovision to online with a NEW instance_id within ${REPROVISION_TIMEOUT_SECS}s..."
|
||||
REPROV_DEADLINE=$(( $(date +%s) + REPROVISION_TIMEOUT_SECS ))
|
||||
REPROV_OK=0
|
||||
REPROV_LAST_STATUS=""
|
||||
NEW_INSTANCE_ID=""
|
||||
while true; do
|
||||
if [ "$(date +%s)" -gt "$REPROV_DEADLINE" ]; then
|
||||
break
|
||||
fi
|
||||
RP_STATUS=$(ws_field "$WS_ID" "status")
|
||||
if [ "$RP_STATUS" != "$REPROV_LAST_STATUS" ]; then
|
||||
log " $WS_ID status → ${RP_STATUS:-<empty>}"
|
||||
REPROV_LAST_STATUS="$RP_STATUS"
|
||||
fi
|
||||
if [ "$RP_STATUS" = "online" ]; then
|
||||
NEW_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
|
||||
if [ -n "$NEW_INSTANCE_ID" ] && [ "$NEW_INSTANCE_ID" != "$ORIGINAL_INSTANCE_ID" ]; then
|
||||
REPROV_OK=1
|
||||
break
|
||||
fi
|
||||
# online again but instance_id either not surfaced yet or still the old
|
||||
# (terminated) id — keep polling until the reprovision swaps it.
|
||||
fi
|
||||
sleep 15
|
||||
done
|
||||
|
||||
if [ "$REPROV_OK" = "1" ]; then
|
||||
ok "SECONDARY held — auto-reprovisioned to online on NEW instance_id=$NEW_INSTANCE_ID (was $ORIGINAL_INSTANCE_ID)"
|
||||
else
|
||||
# Soft-miss — see FUTURE TIGHTENING note above. PRIMARY is the gate.
|
||||
log "⚠️ SECONDARY not satisfied within ${REPROVISION_TIMEOUT_SECS}s (status=${REPROV_LAST_STATUS:-<empty>}, instance_id=${NEW_INSTANCE_ID:-<none>}, original=$ORIGINAL_INSTANCE_ID). NOT failing — the PRIMARY heal-detection assertion is the gate; reprovision is a slower, flakier cold path. Promote this to a hard fail once it's proven reliable."
|
||||
fi
|
||||
|
||||
ok "Reconciler live E2E PASSED — PRIMARY heal-detection held (SECONDARY: $([ "$REPROV_OK" = "1" ] && echo "held" || echo "soft-miss, logged"))"
|
||||
# Teardown runs via the EXIT trap.
|
||||
@@ -161,7 +161,7 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
|
||||
// 1. Strip plugin's rule/fragment markers from CLAUDE.md (mirrors
|
||||
// AgentskillsAdaptor.uninstall lines 184-188). Best-effort: if
|
||||
// the user edited CLAUDE.md, our marker stays untouched.
|
||||
h.stripPluginMarkersFromMemory(ctx, containerName, pluginName)
|
||||
h.stripPluginMarkersFromMemory(ctx, workspaceID, containerName, pluginName)
|
||||
|
||||
// 2. Remove copied skill dirs declared in the plugin's plugin.yaml.
|
||||
for _, skill := range skillNames {
|
||||
@@ -171,9 +171,11 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
|
||||
log.Printf("Plugin uninstall: skipping invalid skill name %q in %s: %v", skill, pluginName, err)
|
||||
continue
|
||||
}
|
||||
_, _ = h.execAsRoot(ctx, containerName, []string{
|
||||
if _, rmErr := h.execAsRoot(ctx, containerName, []string{
|
||||
"rm", "-rf", "/configs/skills/" + skill,
|
||||
})
|
||||
}); rmErr != nil {
|
||||
log.Printf("Plugin uninstall: failed to remove skill %s from %s: %v", skill, workspaceID, rmErr)
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Delete the plugin directory itself (as root to handle file ownership).
|
||||
|
||||
@@ -393,7 +393,7 @@ func (h *PluginsHandler) readPluginSkillsFromContainer(ctx context.Context, cont
|
||||
// `# Plugin: <name> /` — mirrors AgentskillsAdaptor.uninstall's stripping
|
||||
// logic so install/uninstall are symmetric. Best-effort: a read or write
|
||||
// failure is logged but not propagated, since the rest of uninstall must still succeed.
|
||||
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, containerName, pluginName string) {
|
||||
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, workspaceID, containerName, pluginName string) {
|
||||
// Use sed via bash -c for atomic in-place delete: drop the marker line
|
||||
// and the blank line that follows it (install adds a leading blank line
|
||||
// before the marker via append_to_memory). Three sed passes mirror the
|
||||
@@ -417,7 +417,9 @@ func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, conta
|
||||
`awk 'BEGIN{skip=0; blanks=0} /^%s/{skip=1; blanks=0; next} skip==1 && /^[[:space:]]*$/{blanks++; if(blanks>=2){skip=0; print; next} next} /^# Plugin: /{if(skip==1)skip=0} skip==1{next} {print}' /configs/CLAUDE.md > /tmp/claude.new && mv /tmp/claude.new /configs/CLAUDE.md`,
|
||||
regexpEscapeForAwk(marker),
|
||||
)
|
||||
_, _ = h.execAsRoot(ctx, containerName, []string{"bash", "-c", script})
|
||||
if _, awkErr := h.execAsRoot(ctx, containerName, []string{"bash", "-c", script}); awkErr != nil {
|
||||
log.Printf("Plugin uninstall: failed to strip markers from CLAUDE.md for %s in %s: %v", pluginName, workspaceID, awkErr)
|
||||
}
|
||||
}
|
||||
|
||||
// regexpEscapeForAwk escapes characters that have special meaning inside an
|
||||
|
||||
Reference in New Issue
Block a user