diff --git a/.gitea/workflows/continuous-synth-e2e.yml b/.gitea/workflows/continuous-synth-e2e.yml index 8b5f08546..d3f119a40 100644 --- a/.gitea/workflows/continuous-synth-e2e.yml +++ b/.gitea/workflows/continuous-synth-e2e.yml @@ -145,6 +145,11 @@ jobs: E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }} MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }} MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + E2E_AWS_LEAK_CHECK: required # leak check returns 2 if the aws CLI or credentials are unavailable + E2E_AWS_TERMINATE_LEAKS: '1' # leaked slug-tagged EC2 instances are terminated before the check returns 4 # MiniMax key is the canary's PRIMARY auth path. claude-code # template's `minimax` provider routes ANTHROPIC_BASE_URL to # api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot. @@ -185,6 +190,12 @@ jobs: echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway." 
exit 1 fi + for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do + if [ -z "${!var:-}" ]; then # ${!var} expands the variable whose name is held in $var + echo "::error::$var secret missing — EC2 leak verification cannot run" + exit 1 + fi + done # LLM-key requirement is per-runtime: claude-code accepts # EITHER MiniMax OR direct-Anthropic (whichever is set first), diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index f26cda9fc..096648bb3 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -49,6 +49,8 @@ on: - 'workspace-server/internal/middleware/**' - 'workspace-server/internal/provisioner/**' - 'tests/e2e/test_staging_full_saas.sh' + - 'tests/e2e/lib/aws_leak_check.sh' + - 'tests/e2e/test_aws_leak_check.sh' - '.gitea/workflows/e2e-staging-saas.yml' pull_request: branches: [main] @@ -59,6 +61,8 @@ on: - 'workspace-server/internal/middleware/**' - 'workspace-server/internal/provisioner/**' - 'tests/e2e/test_staging_full_saas.sh' + - 'tests/e2e/lib/aws_leak_check.sh' + - 'tests/e2e/test_aws_leak_check.sh' - '.gitea/workflows/e2e-staging-saas.yml' workflow_dispatch: schedule: @@ -127,6 +131,11 @@ jobs: # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per # internal#322 — see this PR for the cross-workflow sweep. MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + E2E_AWS_LEAK_CHECK: required # leak check returns 2 if the aws CLI or credentials are unavailable + E2E_AWS_TERMINATE_LEAKS: '1' # leaked slug-tagged EC2 instances are terminated before the check returns 4 # MiniMax is the PRIMARY LLM auth path post-2026-05-04. 
Switched # from hermes+OpenAI default after #2578 (the staging OpenAI key # account went over quota and stayed dead for 36+ hours, taking @@ -165,6 +174,12 @@ jobs: echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)" exit 2 fi + for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do + if [ -z "${!var:-}" ]; then # ${!var} expands the variable whose name is held in $var + echo "::error::$var not set — EC2 leak verification cannot run" + exit 2 + fi + done echo "Admin token present ✓" - name: Verify LLM key present diff --git a/.gitea/workflows/e2e-staging-sanity.yml b/.gitea/workflows/e2e-staging-sanity.yml index 03431ce8b..d1b8f8eb9 100644 --- a/.gitea/workflows/e2e-staging-sanity.yml +++ b/.gitea/workflows/e2e-staging-sanity.yml @@ -47,6 +47,11 @@ jobs: # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per # internal#322 — see this PR for the cross-workflow sweep. MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + E2E_AWS_LEAK_CHECK: required # leak check returns 2 if the aws CLI or credentials are unavailable + E2E_AWS_TERMINATE_LEAKS: '1' # leaked slug-tagged EC2 instances are terminated before the check returns 4 E2E_MODE: smoke E2E_RUNTIME: hermes E2E_RUN_ID: "sanity-${{ github.run_id }}" @@ -61,6 +66,12 @@ jobs: echo "::error::CP_STAGING_ADMIN_API_TOKEN not set" exit 2 fi + for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do + if [ -z "${!var:-}" ]; then # ${!var} expands the variable whose name is held in $var + echo "::error::$var not set — EC2 leak verification cannot run" + exit 2 + fi + done # Inverted assertion: the run MUST fail. If it passes, the # E2E_INTENTIONAL_FAILURE path is broken. 
diff --git a/.gitea/workflows/staging-smoke.yml b/.gitea/workflows/staging-smoke.yml index 623c47ff7..2d12fd32d 100644 --- a/.gitea/workflows/staging-smoke.yml +++ b/.gitea/workflows/staging-smoke.yml @@ -81,6 +81,11 @@ jobs: # (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per # internal#322 — see this PR for the cross-workflow sweep. 
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + E2E_AWS_LEAK_CHECK: required + E2E_AWS_TERMINATE_LEAKS: '1' # MiniMax is the smoke's PRIMARY LLM auth path post-2026-05-04. # Switched from hermes+OpenAI after #2578 (the staging OpenAI key # account went over quota and stayed dead for 36+ hours, taking @@ -129,6 +134,12 @@ jobs: echo "::error::CP_STAGING_ADMIN_API_TOKEN not set" exit 2 fi + for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do + if [ -z "${!var:-}" ]; then + echo "::error::$var not set — EC2 leak verification cannot run" + exit 2 + fi + done - name: Verify LLM key present run: |
diff --git a/tests/e2e/lib/aws_leak_check.sh b/tests/e2e/lib/aws_leak_check.sh
new file mode 100755
index 000000000..7f23fb18b
--- /dev/null
+++ b/tests/e2e/lib/aws_leak_check.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+
+# EC2 leak check for staging E2E harnesses.
+#
+# Modes:
+#   E2E_AWS_LEAK_CHECK=off       skip
+#   E2E_AWS_LEAK_CHECK=auto      check only when aws + credentials exist
+#   E2E_AWS_LEAK_CHECK=required  fail if aws + credentials are unavailable
+#
+# Optional:
+#   E2E_AWS_LEAK_CHECK_SECS      poll budget, default 90
+#   E2E_AWS_LEAK_CHECK_INTERVAL  poll interval, default 10
+#   E2E_AWS_TERMINATE_LEAKS=1    terminate matching leaked instances
+#
+# e2e_verify_no_ec2_leaks_for_slug returns:
+#   0  no live instance matches the slug, or the check was skipped
+#   2  the check could not run (bad mode, slug or poll settings; aws CLI or
+#      credentials missing in required mode; describe-instances failed)
+#   4  matching instances were still alive when the poll budget ran out
+
+# Print the configured leak-check mode; defaults to "auto".
+e2e_aws_leak_mode() {
+  echo "${E2E_AWS_LEAK_CHECK:-auto}"
+}
+
+# Print the region to query: E2E_AWS_REGION, then AWS_REGION, then
+# AWS_DEFAULT_REGION, falling back to us-east-2.
+e2e_aws_region() {
+  echo "${E2E_AWS_REGION:-${AWS_REGION:-${AWS_DEFAULT_REGION:-us-east-2}}}"
+}
+
+# Succeed only when the aws CLI is on PATH and both static credential
+# variables are non-empty.
+e2e_aws_creds_available() {
+  command -v aws >/dev/null 2>&1 || return 1
+  [ -n "${AWS_ACCESS_KEY_ID:-}" ] || return 1
+  [ -n "${AWS_SECRET_ACCESS_KEY:-}" ] || return 1
+}
+
+# Print one "<instance-id> <state> <Name tag>" row for each pending, running,
+# stopping or stopped instance whose Name tag contains the slug in $1.
+e2e_ec2_instances_for_slug() {
+  local slug="$1"
+  local region
+  region=$(e2e_aws_region)
+
+  # shellcheck disable=SC2016
+  aws ec2 describe-instances \
+    --region "$region" \
+    --filters "Name=tag:Name,Values=*$slug*" \
+      "Name=instance-state-name,Values=pending,running,stopping,stopped" \
+    --query 'Reservations[].Instances[].[InstanceId,State.Name,Tags[?Key==`Name`].Value|[0]]' \
+    --output text
+}
+
+# Terminate the whitespace-separated instance IDs in $1; no-op when empty.
+e2e_terminate_instances() {
+  local ids="$1"
+  local region
+  region=$(e2e_aws_region)
+
+  [ -n "$ids" ] || return 0
+  # shellcheck disable=SC2086
+  aws ec2 terminate-instances --region "$region" --instance-ids $ids >/dev/null
+}
+
+# Poll until no live EC2 instance carries the slug in $1, or the poll budget
+# is spent. With E2E_AWS_TERMINATE_LEAKS=1 the leftovers are terminated
+# before returning 4. Return codes are listed in the file header.
+e2e_verify_no_ec2_leaks_for_slug() {
+  local slug="${1:-}"
+  local mode
+  local max_secs
+  local interval
+  local elapsed=0
+  local rows=""
+  local ids=""
+
+  mode=$(e2e_aws_leak_mode)
+  case "$mode" in
+    off)
+      echo "[aws-leak-check] skipped: E2E_AWS_LEAK_CHECK=off" >&2
+      return 0
+      ;;
+    auto|required) ;;
+    *)
+      echo "[aws-leak-check] invalid E2E_AWS_LEAK_CHECK=$mode (expected off|auto|required)" >&2
+      return 2
+      ;;
+  esac
+
+  # An empty slug would turn the Name filter into "**", which matches every
+  # tagged instance in the region; terminate mode would then kill them all.
+  if [ -z "$slug" ]; then
+    echo "[aws-leak-check] refusing to run with an empty slug" >&2
+    return 2
+  fi
+
+  if ! e2e_aws_creds_available; then
+    if [ "$mode" = "required" ]; then
+      echo "[aws-leak-check] required but aws CLI or AWS credentials are unavailable" >&2
+      return 2
+    fi
+    echo "[aws-leak-check] skipped: aws CLI or AWS credentials unavailable" >&2
+    return 0
+  fi
+
+  max_secs="${E2E_AWS_LEAK_CHECK_SECS:-90}"
+  interval="${E2E_AWS_LEAK_CHECK_INTERVAL:-10}"
+  case "$max_secs$interval" in
+    *[!0-9]*)
+      echo "[aws-leak-check] E2E_AWS_LEAK_CHECK_SECS and E2E_AWS_LEAK_CHECK_INTERVAL must be non-negative integers" >&2
+      return 2
+      ;;
+  esac
+  # Force base 10 so values such as "08" are not rejected as invalid octal.
+  max_secs=$((10#$max_secs))
+  interval=$((10#$interval))
+  # A zero interval would never advance $elapsed; always step at least 1s.
+  [ "$interval" -gt 0 ] || interval=1
+
+  while true; do
+    # Capture stdout only: aws warnings on stderr must not be mistaken for
+    # instance rows (they still reach the caller's stderr directly).
+    rows=$(e2e_ec2_instances_for_slug "$slug") || {
+      echo "[aws-leak-check] aws ec2 describe-instances failed for slug=$slug" >&2
+      [ -z "$rows" ] || echo "$rows" >&2
+      return 2
+    }
+
+    if [ -z "$rows" ] || [ "$rows" = "None" ]; then
+      echo "[aws-leak-check] no live EC2 instances for slug=$slug" >&2
+      return 0
+    fi
+
+    if [ "$elapsed" -ge "$max_secs" ]; then
+      echo "[aws-leak-check] leaked EC2 instance(s) for slug=$slug after ${elapsed}s:" >&2
+      echo "$rows" >&2
+      if [ "${E2E_AWS_TERMINATE_LEAKS:-0}" = "1" ]; then
+        # Only tokens shaped like instance IDs may reach terminate-instances.
+        ids=$(printf '%s\n' "$rows" | awk '$1 ~ /^i-/ {print $1}' | sort -u | tr '\n' ' ')
+        echo "[aws-leak-check] terminating leaked EC2 instance(s): $ids" >&2
+        e2e_terminate_instances "$ids" || {
+          echo "[aws-leak-check] terminate-instances failed for: $ids" >&2
+          return 4
+        }
+      fi
+      return 4
+    fi
+
+    sleep "$interval"
+    elapsed=$((elapsed + interval))
+  done
+}
diff --git a/tests/e2e/test_aws_leak_check.sh b/tests/e2e/test_aws_leak_check.sh
new file mode 100755
index 000000000..ae0473cea
--- /dev/null
+++ b/tests/e2e/test_aws_leak_check.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+# Unit tests for lib/aws_leak_check.sh, driven through a fake `aws` on PATH.
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck disable=SC1091
+# shellcheck source=lib/aws_leak_check.sh
+source "$SCRIPT_DIR/lib/aws_leak_check.sh"
+
+PASS=0
+FAIL=0
+ORIG_PATH="$PATH"
+
+TMPDIR_E2E=$(mktemp -d -t aws-leak-check-e2e-XXXXXX)
+trap '/bin/rm -rf "$TMPDIR_E2E"' EXIT INT TERM
+
+# Install a fake `aws` executable whose behaviour is the shell snippet in $1.
+# Every invocation's arguments are appended to $TMPDIR_E2E/aws.calls.
+make_fake_aws() {
+  local body="$1"
+  mkdir -p "$TMPDIR_E2E/bin"
+  cat > "$TMPDIR_E2E/bin/aws" <<EOF
+#!/usr/bin/env bash
+echo "\$*" >> "$TMPDIR_E2E/aws.calls"
+$body
+EOF
+  chmod +x "$TMPDIR_E2E/bin/aws"
+}
+
+# Restore the baseline environment: fake aws first on PATH, dummy
+# credentials, required mode, zero poll budget, terminate mode off.
+reset_env() {
+  /bin/rm -f "$TMPDIR_E2E/aws.calls"
+  export PATH="$TMPDIR_E2E/bin:$ORIG_PATH"
+  export AWS_ACCESS_KEY_ID=test-access
+  export AWS_SECRET_ACCESS_KEY=test-secret
+  export AWS_DEFAULT_REGION=us-east-2
+  export E2E_AWS_LEAK_CHECK=required
+  export E2E_AWS_LEAK_CHECK_SECS=0
+  export E2E_AWS_LEAK_CHECK_INTERVAL=1
+  unset E2E_AWS_TERMINATE_LEAKS
+}
+
+# Run the command in "$3..." and compare its exit status with $2; $1 labels the case.
+assert_rc() {
+  local label="$1"
+  local expected="$2"
+  shift 2
+  local observed
+  local line
+  # Keep captures inside the private temp dir rather than fixed /tmp names.
+  "$@" >"$TMPDIR_E2E/assert.out" 2>"$TMPDIR_E2E/assert.err"
+  observed=$?
+  if [ "$observed" = "$expected" ]; then
+    echo "  PASS  $label"
+    PASS=$((PASS + 1))
+  else
+    echo "  FAIL  $label: expected rc=$expected observed=$observed" >&2
+    echo "    stderr:" >&2
+    # Builtins only: PATH may not contain sed while aws is being hidden.
+    while IFS= read -r line; do
+      echo "      $line" >&2
+    done < "$TMPDIR_E2E/assert.err"
+    FAIL=$((FAIL + 1))
+  fi
+}
+
+echo "Test: AWS EC2 leak check helper"
+
+reset_env
+/bin/rm -rf "${TMPDIR_E2E:?}/bin"
+/bin/mkdir -p "$TMPDIR_E2E/noaws"
+export PATH="$TMPDIR_E2E/noaws"
+export E2E_AWS_LEAK_CHECK=auto
+assert_rc "auto mode skips when aws is unavailable" 0 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
+
+reset_env
+/bin/rm -rf "${TMPDIR_E2E:?}/bin"
+/bin/mkdir -p "$TMPDIR_E2E/noaws"
+export PATH="$TMPDIR_E2E/noaws"
+export E2E_AWS_LEAK_CHECK=required
+assert_rc "required mode fails when aws is unavailable" 2 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
+
+reset_env
+# shellcheck disable=SC2016
+make_fake_aws 'if [ "$1 $2" = "ec2 describe-instances" ]; then exit 0; fi'
+assert_rc "no matching EC2 returns clean" 0 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
+
+reset_env
+# shellcheck disable=SC2016
+make_fake_aws 'if [ "$1 $2" = "ec2 describe-instances" ]; then echo "warning: noisy client" >&2; exit 0; fi'
+assert_rc "stderr noise on a clean describe is not a leak" 0 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
+
+reset_env
+make_fake_aws 'echo "i-123 running ws-tenant-e2e-smoke-test-abc"'
+assert_rc "empty slug is rejected before aws is called" 2 e2e_verify_no_ec2_leaks_for_slug ""
+if [ -e "$TMPDIR_E2E/aws.calls" ]; then
+  echo "  FAIL  aws was invoked for an empty slug" >&2
+  FAIL=$((FAIL + 1))
+else
+  echo "  PASS  aws was not invoked for an empty slug"
+  PASS=$((PASS + 1))
+fi
+
+reset_env
+# shellcheck disable=SC2016
+make_fake_aws 'if [ "$1 $2" = "ec2 describe-instances" ]; then echo "i-123 running ws-tenant-e2e-smoke-test-abc"; exit 0; fi'
+assert_rc "persistent matching EC2 is a leak" 4 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
+
+reset_env
+export E2E_AWS_TERMINATE_LEAKS=1
+# shellcheck disable=SC2016
+make_fake_aws '
+if [ "$1 $2" = "ec2 describe-instances" ]; then
+  echo "i-123 running ws-tenant-e2e-smoke-test-abc"
+  exit 0
+fi
+if [ "$1 $2" = "ec2 terminate-instances" ]; then
+  echo "terminated" >/dev/null
+  exit 0
+fi
+'
+assert_rc "terminate mode attempts cleanup before returning leak" 4 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
+if grep -q "terminate-instances" "$TMPDIR_E2E/aws.calls"; then
+  echo "  PASS  terminate-instances was called"
+  PASS=$((PASS + 1))
+else
+  echo "  FAIL  terminate-instances was not called" >&2
+  FAIL=$((FAIL + 1))
+fi
+
+echo
+echo "passed=$PASS failed=$FAIL"
+[ "$FAIL" = "0" ]
diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index a6e0ac3c2..a199d149d 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -32,6 +32,11 @@ # mapped to `smoke` for back-compat with # any in-flight runner picking up an older # workflow checkout) +# E2E_AWS_LEAK_CHECK auto (default) | required | off +# required in CI so teardown cannot report +# clean while slug-tagged EC2 remains alive +# E2E_AWS_TERMINATE_LEAKS 1 → terminate slug-tagged leaked EC2 before +# exiting 4 # E2E_INTENTIONAL_FAILURE 1 → poison tenant token mid-run so the # script fails; the EXIT trap MUST still # tear down cleanly (and exit 4 on leak). @@ -82,8 +87,12 @@ ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; } # Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale. # Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch # without booting the full 11-step lifecycle. +# shellcheck disable=SC1091 # shellcheck source=lib/model_slug.sh source "$(dirname "$0")/lib/model_slug.sh" +# shellcheck disable=SC1091 +# shellcheck source=lib/aws_leak_check.sh +source "$(dirname "$0")/lib/aws_leak_check.sh" CURL_COMMON=(-sS --fail-with-body --max-time 30) @@ -119,12 +128,14 @@ cleanup_org() { # DELETE returns 5xx mid-cascade and the cascade finishes anyway, # and the case where DELETE legitimately exceeds 120s and we want # eventual-consistency confirmation. 
- curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \ + if curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \ - && ok "Teardown request accepted" \ - || log "Teardown returned non-2xx (may already be gone)" + -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1; then + ok "Teardown request accepted" + else + log "Teardown returned non-2xx (may already be gone)" + fi local leak_count=1 local elapsed=0 @@ -144,7 +155,15 @@ cleanup_org() { echo "⚠️ LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2 exit 4 fi - ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)" + local aws_leak_rc=0 + e2e_verify_no_ec2_leaks_for_slug "$SLUG" || aws_leak_rc=$? # capture the helper's status without aborting here + if [ "$aws_leak_rc" != "0" ]; then + case "$aws_leak_rc" in + 2) exit 2 ;; # helper returns 2 when the leak check itself could not run + *) exit 4 ;; # any other non-zero status is reported as a leak + esac + fi + ok "Teardown clean — no orphan org or EC2 resources for $SLUG (${elapsed}s)" # Normalize unexpected upstream exit codes to 1 (generic failure). The # script's documented contract (header "Exit codes" section) only emits