fix(e2e): fail teardown on leaked EC2 #1660

Merged
hongming merged 1 commit from fix/e2e-aws-leak-verification into main 2026-05-22 00:36:10 +00:00
7 changed files with 297 additions and 5 deletions
+11
View File
@@ -145,6 +145,11 @@ jobs:
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
E2E_AWS_LEAK_CHECK: required
E2E_AWS_TERMINATE_LEAKS: '1'
# MiniMax key is the canary's PRIMARY auth path. claude-code
# template's `minimax` provider routes ANTHROPIC_BASE_URL to
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot.
@@ -185,6 +190,12 @@ jobs:
echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
exit 1
fi
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
if [ -z "${!var:-}" ]; then
echo "::error::$var secret missing — EC2 leak verification cannot run"
exit 1
fi
done
# LLM-key requirement is per-runtime: claude-code accepts
# EITHER MiniMax OR direct-Anthropic (whichever is set first),
+15
View File
@@ -49,6 +49,8 @@ on:
- 'workspace-server/internal/middleware/**'
- 'workspace-server/internal/provisioner/**'
- 'tests/e2e/test_staging_full_saas.sh'
- 'tests/e2e/lib/aws_leak_check.sh'
- 'tests/e2e/test_aws_leak_check.sh'
- '.gitea/workflows/e2e-staging-saas.yml'
pull_request:
branches: [main]
@@ -59,6 +61,8 @@ on:
- 'workspace-server/internal/middleware/**'
- 'workspace-server/internal/provisioner/**'
- 'tests/e2e/test_staging_full_saas.sh'
- 'tests/e2e/lib/aws_leak_check.sh'
- 'tests/e2e/test_aws_leak_check.sh'
- '.gitea/workflows/e2e-staging-saas.yml'
workflow_dispatch:
schedule:
@@ -127,6 +131,11 @@ jobs:
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
E2E_AWS_LEAK_CHECK: required
E2E_AWS_TERMINATE_LEAKS: '1'
# MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched
# from hermes+OpenAI default after #2578 (the staging OpenAI key
# account went over quota and stayed dead for 36+ hours, taking
@@ -165,6 +174,12 @@ jobs:
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
exit 2
fi
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
if [ -z "${!var:-}" ]; then
echo "::error::$var not set — EC2 leak verification cannot run"
exit 2
fi
done
echo "Admin token present ✓"
- name: Verify LLM key present
+11
View File
@@ -47,6 +47,11 @@ jobs:
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
E2E_AWS_LEAK_CHECK: required
E2E_AWS_TERMINATE_LEAKS: '1'
E2E_MODE: smoke
E2E_RUNTIME: hermes
E2E_RUN_ID: "sanity-${{ github.run_id }}"
@@ -61,6 +66,12 @@ jobs:
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
exit 2
fi
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
if [ -z "${!var:-}" ]; then
echo "::error::$var not set — EC2 leak verification cannot run"
exit 2
fi
done
# Inverted assertion: the run MUST fail. If it passes, the
# E2E_INTENTIONAL_FAILURE path is broken.
+11
View File
@@ -81,6 +81,11 @@ jobs:
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
# internal#322 — see this PR for the cross-workflow sweep.
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
E2E_AWS_LEAK_CHECK: required
E2E_AWS_TERMINATE_LEAKS: '1'
# MiniMax is the smoke's PRIMARY LLM auth path post-2026-05-04.
# Switched from hermes+OpenAI after #2578 (the staging OpenAI key
# account went over quota and stayed dead for 36+ hours, taking
@@ -129,6 +134,12 @@ jobs:
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
exit 2
fi
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
if [ -z "${!var:-}" ]; then
echo "::error::$var not set — EC2 leak verification cannot run"
exit 2
fi
done
- name: Verify LLM key present
run: |
+116
View File
@@ -0,0 +1,116 @@
#!/usr/bin/env bash
# EC2 leak check for staging E2E harnesses.
#
# Modes:
# E2E_AWS_LEAK_CHECK=off skip
# E2E_AWS_LEAK_CHECK=auto check only when aws + credentials exist
# E2E_AWS_LEAK_CHECK=required fail if aws + credentials are unavailable
#
# Optional:
# E2E_AWS_LEAK_CHECK_SECS poll budget, default 90
# E2E_AWS_LEAK_CHECK_INTERVAL poll interval, default 10
# E2E_AWS_TERMINATE_LEAKS=1 terminate matching leaked instances
# Print the configured leak-check mode on stdout.
# Reads E2E_AWS_LEAK_CHECK; an unset or empty value yields "auto".
e2e_aws_leak_mode() {
  local configured_mode="${E2E_AWS_LEAK_CHECK:-auto}"
  echo "$configured_mode"
}
# Print the AWS region to query on stdout.
# Precedence (first non-empty wins): E2E_AWS_REGION, AWS_REGION,
# AWS_DEFAULT_REGION, then the literal fallback us-east-2.
e2e_aws_region() {
  local resolved
  if [ -n "${E2E_AWS_REGION:-}" ]; then
    resolved="$E2E_AWS_REGION"
  elif [ -n "${AWS_REGION:-}" ]; then
    resolved="$AWS_REGION"
  elif [ -n "${AWS_DEFAULT_REGION:-}" ]; then
    resolved="$AWS_DEFAULT_REGION"
  else
    resolved="us-east-2"
  fi
  echo "$resolved"
}
# Report whether an EC2 query can be attempted.
# Returns 0 only when an `aws` command resolves and both
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are non-empty; 1 otherwise.
e2e_aws_creds_available() {
  if ! command -v aws >/dev/null 2>&1; then
    return 1
  fi
  if [ -z "${AWS_ACCESS_KEY_ID:-}" ] || [ -z "${AWS_SECRET_ACCESS_KEY:-}" ]; then
    return 1
  fi
  return 0
}
# List EC2 instances whose Name tag contains the given slug.
# Arguments: $1 - slug fragment to look for inside the Name tag
# Outputs:   whatever `aws ec2 describe-instances --output text` prints for
#            the query [InstanceId, State.Name, Name tag], restricted to the
#            pending/running/stopping/stopped states
# Returns:   the exit status of the aws command
e2e_ec2_instances_for_slug() {
  local name_fragment="$1"
  # Backticks are JMESPath literal syntax, not command substitution.
  # shellcheck disable=SC2016
  local jmespath='Reservations[].Instances[].[InstanceId,State.Name,Tags[?Key==`Name`].Value|[0]]'
  local -a describe_args=(
    ec2 describe-instances
    --region "$(e2e_aws_region)"
    --filters
    "Name=tag:Name,Values=*${name_fragment}*"
    "Name=instance-state-name,Values=pending,running,stopping,stopped"
    --query "$jmespath"
    --output text
  )
  aws "${describe_args[@]}"
}
# Terminate the given EC2 instances.
# Arguments: $1 - whitespace-separated instance ids (one string)
# Returns:   0 without calling aws when $1 is empty; otherwise the exit
#            status of `aws ec2 terminate-instances` (its stdout is discarded)
e2e_terminate_instances() {
  local instance_ids="$1"
  local target_region
  if [ -z "$instance_ids" ]; then
    return 0
  fi
  target_region=$(e2e_aws_region)
  # Intentionally unquoted: the id list must split into separate arguments.
  # shellcheck disable=SC2086
  aws ec2 terminate-instances --region "$target_region" --instance-ids $instance_ids >/dev/null
}
# Verify that no live EC2 instance is still tagged with the given slug.
#
# Polls e2e_ec2_instances_for_slug until it reports nothing or the poll
# budget is exhausted.
#
# Globals (read):
#   E2E_AWS_LEAK_CHECK           off | auto | required (via e2e_aws_leak_mode)
#   E2E_AWS_LEAK_CHECK_SECS      poll budget in seconds, default 90
#   E2E_AWS_LEAK_CHECK_INTERVAL  poll interval in seconds, default 10
#   E2E_AWS_TERMINATE_LEAKS      "1" → terminate leaked instances before failing
# Arguments:
#   $1 - slug that appears inside the Name tag of the instances to look for
# Outputs:
#   progress and diagnostics on stderr only
# Returns:
#   0  clean, or the check was skipped (mode off, or auto without aws/creds)
#   2  configuration/tooling problem (bad mode, missing creds in required
#      mode, empty slug, non-numeric timing values, describe-instances failed)
#   4  instances still present after the poll budget (also returned after a
#      terminate attempt, whether or not that attempt succeeded)
e2e_verify_no_ec2_leaks_for_slug() {
  local slug="${1:-}"
  local mode
  local max_secs
  local interval
  local elapsed=0
  local rows=""
  local ids=""

  mode=$(e2e_aws_leak_mode)
  case "$mode" in
    off)
      echo "[aws-leak-check] skipped: E2E_AWS_LEAK_CHECK=off" >&2
      return 0
      ;;
    auto|required) ;;
    *)
      echo "[aws-leak-check] invalid E2E_AWS_LEAK_CHECK=$mode (expected off|auto|required)" >&2
      return 2
      ;;
  esac

  if ! e2e_aws_creds_available; then
    if [ "$mode" = "required" ]; then
      echo "[aws-leak-check] required but aws CLI or AWS credentials are unavailable" >&2
      return 2
    fi
    echo "[aws-leak-check] skipped: aws CLI or AWS credentials unavailable" >&2
    return 0
  fi

  # An empty slug would turn the tag filter into "**", which matches every
  # Name-tagged instance in the region; with E2E_AWS_TERMINATE_LEAKS=1 that
  # would terminate all of them. Refuse instead.
  if [ -z "$slug" ]; then
    echo "[aws-leak-check] refusing to run with an empty slug (tag filter would match every instance)" >&2
    return 2
  fi

  max_secs="${E2E_AWS_LEAK_CHECK_SECS:-90}"
  interval="${E2E_AWS_LEAK_CHECK_INTERVAL:-10}"
  case "$max_secs" in
    ''|*[!0-9]*)
      echo "[aws-leak-check] invalid E2E_AWS_LEAK_CHECK_SECS=$max_secs (expected a non-negative integer)" >&2
      return 2
      ;;
  esac
  case "$interval" in
    ''|*[!0-9]*)
      echo "[aws-leak-check] invalid E2E_AWS_LEAK_CHECK_INTERVAL=$interval (expected a non-negative integer)" >&2
      return 2
      ;;
  esac
  # Force base 10 so values such as "010" are not read as octal below.
  max_secs=$((10#$max_secs))
  interval=$((10#$interval))
  # A zero interval would never advance `elapsed` and the loop would spin
  # forever while instances persist; poll at least once per second.
  if [ "$interval" -eq 0 ]; then
    interval=1
  fi

  while true; do
    # Capture stdout only. aws stderr (warnings, errors) flows straight to
    # our stderr so it can neither be mistaken for an instance row nor end
    # up in the id list handed to terminate-instances.
    rows=$(e2e_ec2_instances_for_slug "$slug") || {
      echo "[aws-leak-check] aws ec2 describe-instances failed for slug=$slug" >&2
      if [ -n "$rows" ]; then
        echo "$rows" >&2
      fi
      return 2
    }
    if [ -z "$rows" ] || [ "$rows" = "None" ]; then
      echo "[aws-leak-check] no live EC2 instances for slug=$slug" >&2
      return 0
    fi
    if [ "$elapsed" -ge "$max_secs" ]; then
      echo "[aws-leak-check] leaked EC2 instance(s) for slug=$slug after ${elapsed}s:" >&2
      echo "$rows" >&2
      if [ "${E2E_AWS_TERMINATE_LEAKS:-0}" = "1" ]; then
        # First column of each row is the instance id; keep only tokens that
        # look like one so nothing else is passed to terminate-instances.
        ids=$(printf '%s\n' "$rows" | awk '$1 ~ /^i-/ {print $1}' | sort -u | tr '\n' ' ')
        echo "[aws-leak-check] terminating leaked EC2 instance(s): $ids" >&2
        e2e_terminate_instances "$ids" || {
          echo "[aws-leak-check] terminate-instances failed for: $ids" >&2
          return 4
        }
      fi
      # Still a failure even after cleanup: the teardown under test leaked.
      return 4
    fi
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done
}
+109
View File
@@ -0,0 +1,109 @@
#!/usr/bin/env bash
# Unit-test harness setup for lib/aws_leak_check.sh.
# errexit (-e) is not enabled here: assert_rc below runs commands that are
# expected to return non-zero and then reads $?.
set -uo pipefail
# Resolve the directory containing this script so the library is found
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck disable=SC1091
# shellcheck source=lib/aws_leak_check.sh
source "$SCRIPT_DIR/lib/aws_leak_check.sh"
# Running tallies updated by assert_rc and the inline checks further down.
PASS=0
FAIL=0
# Per-run scratch directory: holds the fake aws binary and its call log.
TMPDIR_E2E=$(mktemp -d -t aws-leak-check-e2e-XXXXXX)
# Remove the scratch directory on normal exit as well as on INT/TERM.
trap 'rm -rf "$TMPDIR_E2E"' EXIT INT TERM
# Install an executable stub at $TMPDIR_E2E/bin/aws.
# The stub appends its argument string to $TMPDIR_E2E/aws.calls and then
# runs the caller-supplied script text.
# Arguments: $1 - bash source to run inside the stub after logging
# Globals:   TMPDIR_E2E (read)
make_fake_aws() {
  local script_body="$1"
  local bin_dir="$TMPDIR_E2E/bin"
  local stub="$bin_dir/aws"
  mkdir -p "$bin_dir"
  {
    printf '%s\n' '#!/usr/bin/env bash' 'set -euo pipefail'
    # The log path is baked in at generation time; $* stays literal so it is
    # expanded when the stub runs.
    printf 'echo "$*" >> "%s"\n' "$TMPDIR_E2E/aws.calls"
    printf '%s\n' "$script_body"
  } > "$stub"
  chmod +x "$stub"
}
# Restore the baseline environment before each scenario.
# Clears the fake-aws call log, puts the fake bin directory first on PATH,
# exports dummy credentials, selects required mode with a zero-second poll
# budget, and removes E2E_AWS_TERMINATE_LEAKS.
# Globals: TMPDIR_E2E, ORIG_PATH (read); PATH and the AWS_*/E2E_AWS_* test
#          variables (written and exported)
reset_env() {
  /bin/rm -f "$TMPDIR_E2E/aws.calls"

  PATH="$TMPDIR_E2E/bin:$ORIG_PATH"
  AWS_ACCESS_KEY_ID=test-access
  AWS_SECRET_ACCESS_KEY=test-secret
  AWS_DEFAULT_REGION=us-east-2
  E2E_AWS_LEAK_CHECK=required
  E2E_AWS_LEAK_CHECK_SECS=0
  E2E_AWS_LEAK_CHECK_INTERVAL=1
  export PATH \
    AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_DEFAULT_REGION \
    E2E_AWS_LEAK_CHECK E2E_AWS_LEAK_CHECK_SECS E2E_AWS_LEAK_CHECK_INTERVAL

  unset E2E_AWS_TERMINATE_LEAKS
}
# Run a command and compare its exit status with the expected one.
#
# The command runs in the current shell (it may be a shell function) with
# stdout and stderr captured to files. On a mismatch the captured stderr is
# replayed, indented, on our stderr.
#
# Globals:   PASS, FAIL (incremented); TMPDIR_E2E (read, capture directory)
# Arguments: $1 - human-readable label
#            $2 - expected exit status
#            $3… - command and its arguments
assert_rc() {
  local label="$1"
  local expected="$2"
  shift 2
  local observed
  # Keep captures inside the per-run scratch directory rather than at fixed,
  # shared paths in /tmp; fall back to TMPDIR//tmp only when it is not set.
  local scratch_dir="${TMPDIR_E2E:-${TMPDIR:-/tmp}}"
  local out_file="$scratch_dir/assert-rc.out"
  local err_file="$scratch_dir/assert-rc.err"
  local line
  "$@" >"$out_file" 2>"$err_file"
  observed=$?
  if [ "$observed" = "$expected" ]; then
    echo " PASS $label"
    PASS=$((PASS + 1))
  else
    echo " FAIL $label: expected rc=$expected observed=$observed" >&2
    echo " stderr:" >&2
    # Builtins only: some scenarios point PATH at an empty directory, where
    # an external tool such as sed cannot be found and the diagnostic would
    # be lost.
    while IFS= read -r line || [ -n "$line" ]; do
      printf ' %s\n' "$line" >&2
    done <"$err_file"
    FAIL=$((FAIL + 1))
  fi
}
# Remember the real PATH; reset_env rebuilds PATH from it before each scenario.
ORIG_PATH="$PATH"
echo "Test: AWS EC2 leak check helper"

# Scenario: auto mode, no aws on PATH → the check is skipped (rc 0).
# PATH is pointed at an empty directory so `command -v aws` cannot resolve.
reset_env
/bin/rm -rf "${TMPDIR_E2E:?}/bin"
/bin/mkdir -p "$TMPDIR_E2E/noaws"
export PATH="$TMPDIR_E2E/noaws"
export E2E_AWS_LEAK_CHECK=auto
assert_rc "auto mode skips when aws is unavailable" 0 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test

# Scenario: required mode, no aws on PATH → configuration failure (rc 2).
reset_env
/bin/rm -rf "${TMPDIR_E2E:?}/bin"
/bin/mkdir -p "$TMPDIR_E2E/noaws"
export PATH="$TMPDIR_E2E/noaws"
export E2E_AWS_LEAK_CHECK=required
assert_rc "required mode fails when aws is unavailable" 2 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test

# Scenario: describe-instances succeeds with no output → clean (rc 0).
reset_env
# shellcheck disable=SC2016
make_fake_aws 'if [ "$1 $2" = "ec2 describe-instances" ]; then exit 0; fi'
assert_rc "no matching EC2 returns clean" 0 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test

# Scenario: describe-instances keeps returning a row → leak (rc 4).
# reset_env sets E2E_AWS_LEAK_CHECK_SECS=0, so the budget is already spent on
# the first poll and no sleep happens.
reset_env
# shellcheck disable=SC2016
make_fake_aws 'if [ "$1 $2" = "ec2 describe-instances" ]; then echo "i-123 running ws-tenant-e2e-smoke-test-abc"; exit 0; fi'
assert_rc "persistent matching EC2 is a leak" 4 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test

# Scenario: same leak with E2E_AWS_TERMINATE_LEAKS=1 → still rc 4, and the
# fake aws call log must show a terminate-instances invocation.
reset_env
export E2E_AWS_TERMINATE_LEAKS=1
# shellcheck disable=SC2016
make_fake_aws '
if [ "$1 $2" = "ec2 describe-instances" ]; then
echo "i-123 running ws-tenant-e2e-smoke-test-abc"
exit 0
fi
if [ "$1 $2" = "ec2 terminate-instances" ]; then
echo "terminated" >/dev/null
exit 0
fi
'
assert_rc "terminate mode attempts cleanup before returning leak" 4 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
# aws.calls is written by the stub generated in make_fake_aws.
if grep -q "terminate-instances" "$TMPDIR_E2E/aws.calls"; then
echo " PASS terminate-instances was called"
PASS=$((PASS + 1))
else
echo " FAIL terminate-instances was not called" >&2
FAIL=$((FAIL + 1))
fi

# Summary; the final test's status becomes the script's exit status
# (0 only when nothing failed).
echo
echo "passed=$PASS failed=$FAIL"
[ "$FAIL" = "0" ]
+24 -5
View File
@@ -32,6 +32,11 @@
# mapped to `smoke` for back-compat with
# any in-flight runner picking up an older
# workflow checkout)
# E2E_AWS_LEAK_CHECK auto (default) | required | off
# required in CI so teardown cannot report
# clean while slug-tagged EC2 remains alive
# E2E_AWS_TERMINATE_LEAKS 1 → terminate slug-tagged leaked EC2 before
# exiting 4
# E2E_INTENTIONAL_FAILURE 1 → poison tenant token mid-run so the
# script fails; the EXIT trap MUST still
# tear down cleanly (and exit 4 on leak).
@@ -82,8 +87,12 @@ ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
# without booting the full 11-step lifecycle.
# shellcheck disable=SC1091
# shellcheck source=lib/model_slug.sh
source "$(dirname "$0")/lib/model_slug.sh"
# shellcheck disable=SC1091
# shellcheck source=lib/aws_leak_check.sh
source "$(dirname "$0")/lib/aws_leak_check.sh"
CURL_COMMON=(-sS --fail-with-body --max-time 30)
@@ -119,12 +128,14 @@ cleanup_org() {
# DELETE returns 5xx mid-cascade and the cascade finishes anyway,
# and the case where DELETE legitimately exceeds 120s and we want
# eventual-consistency confirmation.
curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
if curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
-H "Authorization: Bearer $ADMIN_TOKEN" \
-H "Content-Type: application/json" \
-d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \
&& ok "Teardown request accepted" \
|| log "Teardown returned non-2xx (may already be gone)"
-d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1; then
ok "Teardown request accepted"
else
log "Teardown returned non-2xx (may already be gone)"
fi
local leak_count=1
local elapsed=0
@@ -144,7 +155,15 @@ cleanup_org() {
echo "⚠️ LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2
exit 4
fi
ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
local aws_leak_rc=0
e2e_verify_no_ec2_leaks_for_slug "$SLUG" || aws_leak_rc=$?
if [ "$aws_leak_rc" != "0" ]; then
case "$aws_leak_rc" in
2) exit 2 ;;
*) exit 4 ;;
esac
fi
ok "Teardown clean — no orphan org or EC2 resources for $SLUG (${elapsed}s)"
# Normalize unexpected upstream exit codes to 1 (generic failure). The
# script's documented contract (header "Exit codes" section) only emits