fix(e2e): fail teardown on leaked EC2 #1660
@@ -145,6 +145,11 @@ jobs:
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
|
||||
MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
E2E_AWS_LEAK_CHECK: required
|
||||
E2E_AWS_TERMINATE_LEAKS: '1'
|
||||
# MiniMax key is the canary's PRIMARY auth path. claude-code
|
||||
# template's `minimax` provider routes ANTHROPIC_BASE_URL to
|
||||
# api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot.
|
||||
@@ -185,6 +190,12 @@ jobs:
|
||||
echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
|
||||
exit 1
|
||||
fi
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
echo "::error::$var secret missing — EC2 leak verification cannot run"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
# LLM-key requirement is per-runtime: claude-code accepts
|
||||
# EITHER MiniMax OR direct-Anthropic (whichever is set first),
|
||||
|
||||
@@ -49,6 +49,8 @@ on:
|
||||
- 'workspace-server/internal/middleware/**'
|
||||
- 'workspace-server/internal/provisioner/**'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/test_aws_leak_check.sh'
|
||||
- '.gitea/workflows/e2e-staging-saas.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
@@ -59,6 +61,8 @@ on:
|
||||
- 'workspace-server/internal/middleware/**'
|
||||
- 'workspace-server/internal/provisioner/**'
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- 'tests/e2e/lib/aws_leak_check.sh'
|
||||
- 'tests/e2e/test_aws_leak_check.sh'
|
||||
- '.gitea/workflows/e2e-staging-saas.yml'
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
@@ -127,6 +131,11 @@ jobs:
|
||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
||||
# internal#322 — see this PR for the cross-workflow sweep.
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
E2E_AWS_LEAK_CHECK: required
|
||||
E2E_AWS_TERMINATE_LEAKS: '1'
|
||||
# MiniMax is the PRIMARY LLM auth path post-2026-05-04. Switched
|
||||
# from hermes+OpenAI default after #2578 (the staging OpenAI key
|
||||
# account went over quota and stayed dead for 36+ hours, taking
|
||||
@@ -165,6 +174,12 @@ jobs:
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
|
||||
exit 2
|
||||
fi
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
echo "::error::$var not set — EC2 leak verification cannot run"
|
||||
exit 2
|
||||
fi
|
||||
done
|
||||
echo "Admin token present ✓"
|
||||
|
||||
- name: Verify LLM key present
|
||||
|
||||
@@ -47,6 +47,11 @@ jobs:
|
||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
||||
# internal#322 — see this PR for the cross-workflow sweep.
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
E2E_AWS_LEAK_CHECK: required
|
||||
E2E_AWS_TERMINATE_LEAKS: '1'
|
||||
E2E_MODE: smoke
|
||||
E2E_RUNTIME: hermes
|
||||
E2E_RUN_ID: "sanity-${{ github.run_id }}"
|
||||
@@ -61,6 +66,12 @@ jobs:
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
|
||||
exit 2
|
||||
fi
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
echo "::error::$var not set — EC2 leak verification cannot run"
|
||||
exit 2
|
||||
fi
|
||||
done
|
||||
|
||||
# Inverted assertion: the run MUST fail. If it passes, the
|
||||
# E2E_INTENTIONAL_FAILURE path is broken.
|
||||
|
||||
@@ -81,6 +81,11 @@ jobs:
|
||||
# (dead in org secret store) to CP_STAGING_ADMIN_API_TOKEN per
|
||||
# internal#322 — see this PR for the cross-workflow sweep.
|
||||
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
AWS_DEFAULT_REGION: us-east-2
|
||||
E2E_AWS_LEAK_CHECK: required
|
||||
E2E_AWS_TERMINATE_LEAKS: '1'
|
||||
# MiniMax is the smoke's PRIMARY LLM auth path post-2026-05-04.
|
||||
# Switched from hermes+OpenAI after #2578 (the staging OpenAI key
|
||||
# account went over quota and stayed dead for 36+ hours, taking
|
||||
@@ -129,6 +134,12 @@ jobs:
|
||||
echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
|
||||
exit 2
|
||||
fi
|
||||
for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
if [ -z "${!var:-}" ]; then
|
||||
echo "::error::$var not set — EC2 leak verification cannot run"
|
||||
exit 2
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Verify LLM key present
|
||||
run: |
|
||||
|
||||
Executable
+116
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# EC2 leak check for staging E2E harnesses.
|
||||
#
|
||||
# Modes:
|
||||
# E2E_AWS_LEAK_CHECK=off skip
|
||||
# E2E_AWS_LEAK_CHECK=auto check only when aws + credentials exist
|
||||
# E2E_AWS_LEAK_CHECK=required fail if aws + credentials are unavailable
|
||||
#
|
||||
# Optional:
|
||||
# E2E_AWS_LEAK_CHECK_SECS poll budget, default 90
|
||||
# E2E_AWS_LEAK_CHECK_INTERVAL poll interval, default 10
|
||||
# E2E_AWS_TERMINATE_LEAKS=1 terminate matching leaked instances
|
||||
|
||||
e2e_aws_leak_mode() {
|
||||
echo "${E2E_AWS_LEAK_CHECK:-auto}"
|
||||
}
|
||||
|
||||
e2e_aws_region() {
|
||||
echo "${E2E_AWS_REGION:-${AWS_REGION:-${AWS_DEFAULT_REGION:-us-east-2}}}"
|
||||
}
|
||||
|
||||
e2e_aws_creds_available() {
|
||||
command -v aws >/dev/null 2>&1 || return 1
|
||||
[ -n "${AWS_ACCESS_KEY_ID:-}" ] || return 1
|
||||
[ -n "${AWS_SECRET_ACCESS_KEY:-}" ] || return 1
|
||||
}
|
||||
|
||||
e2e_ec2_instances_for_slug() {
|
||||
local slug="$1"
|
||||
local region
|
||||
region=$(e2e_aws_region)
|
||||
|
||||
# shellcheck disable=SC2016
|
||||
aws ec2 describe-instances \
|
||||
--region "$region" \
|
||||
--filters "Name=tag:Name,Values=*$slug*" \
|
||||
"Name=instance-state-name,Values=pending,running,stopping,stopped" \
|
||||
--query 'Reservations[].Instances[].[InstanceId,State.Name,Tags[?Key==`Name`].Value|[0]]' \
|
||||
--output text
|
||||
}
|
||||
|
||||
e2e_terminate_instances() {
|
||||
local ids="$1"
|
||||
local region
|
||||
region=$(e2e_aws_region)
|
||||
|
||||
[ -n "$ids" ] || return 0
|
||||
# shellcheck disable=SC2086
|
||||
aws ec2 terminate-instances --region "$region" --instance-ids $ids >/dev/null
|
||||
}
|
||||
|
||||
e2e_verify_no_ec2_leaks_for_slug() {
|
||||
local slug="$1"
|
||||
local mode
|
||||
local max_secs
|
||||
local interval
|
||||
local elapsed=0
|
||||
local rows=""
|
||||
local ids=""
|
||||
|
||||
mode=$(e2e_aws_leak_mode)
|
||||
case "$mode" in
|
||||
off)
|
||||
echo "[aws-leak-check] skipped: E2E_AWS_LEAK_CHECK=off" >&2
|
||||
return 0
|
||||
;;
|
||||
auto|required) ;;
|
||||
*)
|
||||
echo "[aws-leak-check] invalid E2E_AWS_LEAK_CHECK=$mode (expected off|auto|required)" >&2
|
||||
return 2
|
||||
;;
|
||||
esac
|
||||
|
||||
if ! e2e_aws_creds_available; then
|
||||
if [ "$mode" = "required" ]; then
|
||||
echo "[aws-leak-check] required but aws CLI or AWS credentials are unavailable" >&2
|
||||
return 2
|
||||
fi
|
||||
echo "[aws-leak-check] skipped: aws CLI or AWS credentials unavailable" >&2
|
||||
return 0
|
||||
fi
|
||||
|
||||
max_secs="${E2E_AWS_LEAK_CHECK_SECS:-90}"
|
||||
interval="${E2E_AWS_LEAK_CHECK_INTERVAL:-10}"
|
||||
|
||||
while true; do
|
||||
rows=$(e2e_ec2_instances_for_slug "$slug" 2>&1) || {
|
||||
echo "[aws-leak-check] aws ec2 describe-instances failed for slug=$slug" >&2
|
||||
echo "$rows" >&2
|
||||
return 2
|
||||
}
|
||||
|
||||
if [ -z "$rows" ] || [ "$rows" = "None" ]; then
|
||||
echo "[aws-leak-check] no live EC2 instances for slug=$slug" >&2
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [ "$elapsed" -ge "$max_secs" ]; then
|
||||
echo "[aws-leak-check] leaked EC2 instance(s) for slug=$slug after ${elapsed}s:" >&2
|
||||
echo "$rows" >&2
|
||||
if [ "${E2E_AWS_TERMINATE_LEAKS:-0}" = "1" ]; then
|
||||
ids=$(echo "$rows" | awk 'NF {print $1}' | sort -u | tr '\n' ' ')
|
||||
echo "[aws-leak-check] terminating leaked EC2 instance(s): $ids" >&2
|
||||
e2e_terminate_instances "$ids" || {
|
||||
echo "[aws-leak-check] terminate-instances failed for: $ids" >&2
|
||||
return 4
|
||||
}
|
||||
fi
|
||||
return 4
|
||||
fi
|
||||
|
||||
sleep "$interval"
|
||||
elapsed=$((elapsed + interval))
|
||||
done
|
||||
}
|
||||
Executable
+109
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env bash
|
||||
set -uo pipefail
|
||||
|
||||
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck disable=SC1091
|
||||
# shellcheck source=lib/aws_leak_check.sh
|
||||
source "$SCRIPT_DIR/lib/aws_leak_check.sh"
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
TMPDIR_E2E=$(mktemp -d -t aws-leak-check-e2e-XXXXXX)
|
||||
trap 'rm -rf "$TMPDIR_E2E"' EXIT INT TERM
|
||||
|
||||
make_fake_aws() {
|
||||
local body="$1"
|
||||
mkdir -p "$TMPDIR_E2E/bin"
|
||||
cat > "$TMPDIR_E2E/bin/aws" <<EOF
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
echo "\$*" >> "$TMPDIR_E2E/aws.calls"
|
||||
$body
|
||||
EOF
|
||||
chmod +x "$TMPDIR_E2E/bin/aws"
|
||||
}
|
||||
|
||||
reset_env() {
|
||||
/bin/rm -f "$TMPDIR_E2E/aws.calls"
|
||||
export PATH="$TMPDIR_E2E/bin:$ORIG_PATH"
|
||||
export AWS_ACCESS_KEY_ID=test-access
|
||||
export AWS_SECRET_ACCESS_KEY=test-secret
|
||||
export AWS_DEFAULT_REGION=us-east-2
|
||||
export E2E_AWS_LEAK_CHECK=required
|
||||
export E2E_AWS_LEAK_CHECK_SECS=0
|
||||
export E2E_AWS_LEAK_CHECK_INTERVAL=1
|
||||
unset E2E_AWS_TERMINATE_LEAKS
|
||||
}
|
||||
|
||||
assert_rc() {
|
||||
local label="$1"
|
||||
local expected="$2"
|
||||
shift 2
|
||||
local observed
|
||||
"$@" >/tmp/aws-leak-check.out 2>/tmp/aws-leak-check.err
|
||||
observed=$?
|
||||
if [ "$observed" = "$expected" ]; then
|
||||
echo " PASS $label"
|
||||
PASS=$((PASS + 1))
|
||||
else
|
||||
echo " FAIL $label: expected rc=$expected observed=$observed" >&2
|
||||
echo " stderr:" >&2
|
||||
sed 's/^/ /' /tmp/aws-leak-check.err >&2
|
||||
FAIL=$((FAIL + 1))
|
||||
fi
|
||||
}
|
||||
|
||||
ORIG_PATH="$PATH"
|
||||
|
||||
echo "Test: AWS EC2 leak check helper"
|
||||
|
||||
reset_env
|
||||
/bin/rm -rf "${TMPDIR_E2E:?}/bin"
|
||||
/bin/mkdir -p "$TMPDIR_E2E/noaws"
|
||||
export PATH="$TMPDIR_E2E/noaws"
|
||||
export E2E_AWS_LEAK_CHECK=auto
|
||||
assert_rc "auto mode skips when aws is unavailable" 0 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
|
||||
|
||||
reset_env
|
||||
/bin/rm -rf "${TMPDIR_E2E:?}/bin"
|
||||
/bin/mkdir -p "$TMPDIR_E2E/noaws"
|
||||
export PATH="$TMPDIR_E2E/noaws"
|
||||
export E2E_AWS_LEAK_CHECK=required
|
||||
assert_rc "required mode fails when aws is unavailable" 2 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
|
||||
|
||||
reset_env
|
||||
# shellcheck disable=SC2016
|
||||
make_fake_aws 'if [ "$1 $2" = "ec2 describe-instances" ]; then exit 0; fi'
|
||||
assert_rc "no matching EC2 returns clean" 0 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
|
||||
|
||||
reset_env
|
||||
# shellcheck disable=SC2016
|
||||
make_fake_aws 'if [ "$1 $2" = "ec2 describe-instances" ]; then echo "i-123 running ws-tenant-e2e-smoke-test-abc"; exit 0; fi'
|
||||
assert_rc "persistent matching EC2 is a leak" 4 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
|
||||
|
||||
reset_env
|
||||
export E2E_AWS_TERMINATE_LEAKS=1
|
||||
# shellcheck disable=SC2016
|
||||
make_fake_aws '
|
||||
if [ "$1 $2" = "ec2 describe-instances" ]; then
|
||||
echo "i-123 running ws-tenant-e2e-smoke-test-abc"
|
||||
exit 0
|
||||
fi
|
||||
if [ "$1 $2" = "ec2 terminate-instances" ]; then
|
||||
echo "terminated" >/dev/null
|
||||
exit 0
|
||||
fi
|
||||
'
|
||||
assert_rc "terminate mode attempts cleanup before returning leak" 4 e2e_verify_no_ec2_leaks_for_slug e2e-smoke-test
|
||||
if grep -q "terminate-instances" "$TMPDIR_E2E/aws.calls"; then
|
||||
echo " PASS terminate-instances was called"
|
||||
PASS=$((PASS + 1))
|
||||
else
|
||||
echo " FAIL terminate-instances was not called" >&2
|
||||
FAIL=$((FAIL + 1))
|
||||
fi
|
||||
|
||||
echo
|
||||
echo "passed=$PASS failed=$FAIL"
|
||||
[ "$FAIL" = "0" ]
|
||||
@@ -32,6 +32,11 @@
|
||||
# mapped to `smoke` for back-compat with
|
||||
# any in-flight runner picking up an older
|
||||
# workflow checkout)
|
||||
# E2E_AWS_LEAK_CHECK auto (default) | required | off
|
||||
# required in CI so teardown cannot report
|
||||
# clean while slug-tagged EC2 remains alive
|
||||
# E2E_AWS_TERMINATE_LEAKS 1 → terminate slug-tagged leaked EC2 before
|
||||
# exiting 4
|
||||
# E2E_INTENTIONAL_FAILURE 1 → poison tenant token mid-run so the
|
||||
# script fails; the EXIT trap MUST still
|
||||
# tear down cleanly (and exit 4 on leak).
|
||||
@@ -82,8 +87,12 @@ ok() { echo "[$(date +%H:%M:%S)] ✅ $*"; }
|
||||
# Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
|
||||
# Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
|
||||
# without booting the full 11-step lifecycle.
|
||||
# shellcheck disable=SC1091
|
||||
# shellcheck source=lib/model_slug.sh
|
||||
source "$(dirname "$0")/lib/model_slug.sh"
|
||||
# shellcheck disable=SC1091
|
||||
# shellcheck source=lib/aws_leak_check.sh
|
||||
source "$(dirname "$0")/lib/aws_leak_check.sh"
|
||||
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
|
||||
@@ -119,12 +128,14 @@ cleanup_org() {
|
||||
# DELETE returns 5xx mid-cascade and the cascade finishes anyway,
|
||||
# and the case where DELETE legitimately exceeds 120s and we want
|
||||
# eventual-consistency confirmation.
|
||||
curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
|
||||
if curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \
|
||||
&& ok "Teardown request accepted" \
|
||||
|| log "Teardown returned non-2xx (may already be gone)"
|
||||
-d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1; then
|
||||
ok "Teardown request accepted"
|
||||
else
|
||||
log "Teardown returned non-2xx (may already be gone)"
|
||||
fi
|
||||
|
||||
local leak_count=1
|
||||
local elapsed=0
|
||||
@@ -144,7 +155,15 @@ cleanup_org() {
|
||||
echo "⚠️ LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2
|
||||
exit 4
|
||||
fi
|
||||
ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
|
||||
local aws_leak_rc=0
|
||||
e2e_verify_no_ec2_leaks_for_slug "$SLUG" || aws_leak_rc=$?
|
||||
if [ "$aws_leak_rc" != "0" ]; then
|
||||
case "$aws_leak_rc" in
|
||||
2) exit 2 ;;
|
||||
*) exit 4 ;;
|
||||
esac
|
||||
fi
|
||||
ok "Teardown clean — no orphan org or EC2 resources for $SLUG (${elapsed}s)"
|
||||
|
||||
# Normalize unexpected upstream exit codes to 1 (generic failure). The
|
||||
# script's documented contract (header "Exit codes" section) only emits
|
||||
|
||||
Reference in New Issue
Block a user