From 4fce32ec3c9b0e8f42f6fdb4f53992500b8d4ab0 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 11:13:56 -0700 Subject: [PATCH] =?UTF-8?q?fix(e2e):=20teardown=20patience=20matches=20pro?= =?UTF-8?q?d=20cascade=20duration=20(~30=E2=80=9390s)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit E2E Staging SaaS has been failing on every cron + push run since 2026-04-27 with `LEAK: org … still present post-teardown (count=1)`, exit 4. Root cause: the curl timeout on the teardown DELETE was 30s and the post-DELETE leak check was a single 10s sleep — but the DELETE handler runs the full GDPR Art. 17 cascade synchronously, including EC2 termination, which AWS reports in 30–60s. Real-world wall time on a prod-shaped run was 57s on 2026-04-27 (hongmingwang DELETE); the 30s curl timeout aborted the request mid-cascade and the 10s post-sleep check found the row still present (status not yet 'purged'). Two-part fix to match real cascade timing: 1. DELETE curl gets its own --max-time 120 (was 30) so the synchronous cascade has room to complete in-band. 2. The leak check polls up to 60s for status='purged' instead of one rigid 10s sleep. Covers two cases: - DELETE returns 5xx mid-cascade but the cascade finishes anyway (we still observe a clean state). - DELETE legitimately exceeds 120s — the poll still observes the eventual purge instead of false-flagging a leak. The 5–15s estimate in `molecule-controlplane/internal/handlers/purge.go`'s comment is the API-call cost only, not the AWS-side time-to-termination it waits on. The async-purge refactor noted in that comment would let us drop these timeouts back to ~15s — file that under future work.
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/e2e/test_staging_full_saas.sh | 41 ++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 370df37b..47f11c28 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -86,24 +86,47 @@ cleanup_org() { fi log "🧹 Tearing down org $SLUG..." - curl "${CURL_COMMON[@]}" -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \ + + # The DELETE handler runs the GDPR Art. 17 cascade synchronously + # (Stripe + Redis + EC2 terminate + CF tunnel + DNS + DB rows). Real + # observed wall-time on prod-shaped infra is ~30–90s — EC2 termination + # alone takes 30–60s. The 5–15s estimate in `purge.go`'s comment is + # the API-call cost, NOT the AWS-side time-to-termination it waits on. + # + # Two-part patience to match reality: + # 1. 120s curl timeout on the DELETE itself (was 30s) so the + # synchronous cascade has room to complete in-band. + # 2. Poll up to 60s after for organizations.status='purged' (or row + # gone) instead of one rigid 10s sleep — covers the case where + # DELETE returns 5xx mid-cascade and the cascade finishes anyway, + # and the case where DELETE legitimately exceeds 120s and we want + # eventual-consistency confirmation. 
+ curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \ && ok "Teardown request accepted" \ || log "Teardown returned non-2xx (may already be gone)" - sleep 10 - local leak_count - leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \ - -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ - | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \ - 2>/dev/null || echo 0) + local leak_count=1 + local elapsed=0 + while [ "$elapsed" -lt 60 ]; do + leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \ + -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \ + | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \ + 2>/dev/null || echo 1) + if [ "$leak_count" = "0" ]; then + break + fi + sleep 5 + elapsed=$((elapsed + 5)) + done + if [ "$leak_count" != "0" ]; then - echo "⚠️ LEAK: org $SLUG still present post-teardown (count=$leak_count)" >&2 + echo "⚠️ LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2 exit 4 fi - ok "Teardown clean — no orphan resources for $SLUG" + ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)" # Normalize unexpected upstream exit codes to 1 (generic failure). The # script's documented contract (header "Exit codes" section) only emits