From 3cd8c53de032f56a52ae4f6c526ef6654029447f Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Sun, 3 May 2026 16:08:40 -0700
Subject: [PATCH] ci: tighten e2e cleanup race window 120m -> ~45m worst case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes that close one of the leak classes from the
molecule-controlplane#420 vCPU audit:

1. sweep-stale-e2e-orgs.yml: cron */15 (was hourly), MAX_AGE_MINUTES 30
   (was 120). E2E runs are 8-25 min wall clock; 30 min is safely above
   the longest run while shrinking the worst-case leak window from ~2h
   to ~45 min (15-min sweep cadence + 30-min threshold).

2. canary-staging.yml teardown: the per-slug DELETE used
   `>/dev/null || true`, which swallowed every failure. A 5xx or
   timeout from CP looked identical to "successfully deleted" and the
   canary tenant kept eating ~2 vCPU until the sweeper caught it. Now
   we capture the response code and surface non-2xx as a workflow
   warning that names the leaked slug. The exit semantics stay
   unchanged — a single-canary cleanup miss shouldn't fail-flag the
   canary itself when the actual smoke check passed. The sweeper is
   the safety net for whatever slips past.

Caught during the molecule-controlplane#420 audit on 2026-05-03 —
3 e2e canary tenant orphans had been running for 24-95 min, all under
the previous 120-min sweep threshold, so they went unnoticed until
manual cleanup.

The same `|| true` pattern exists in
e2e-staging-{canvas,external,saas,sanity}.yml; that is out of scope for
this PR (mechanical port; tracking separately), but the sweeper
tightening covers all of those workflows by reducing the safety-net
latency.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/canary-staging.yml       | 28 ++++++++++++++++++++++++++--
 .github/workflows/sweep-stale-e2e-orgs.yml | 21 ++++++++++++++-------
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml
index 396d53ca..93f53ca7 100644
--- a/.github/workflows/canary-staging.yml
+++ b/.github/workflows/canary-staging.yml
@@ -231,10 +231,34 @@ jobs:
                      and o.get('status') not in ('purged',)]
              print('\n'.join(candidates))
           " 2>/dev/null)
+          # Per-slug DELETE with HTTP-code verification. The previous
+          # `... >/dev/null || true` swallowed every failure, so a 5xx
+          # or timeout from CP looked identical to "successfully cleaned
+          # up" and the tenant kept eating ~2 vCPU until the hourly
+          # stale sweep caught it (up to 2h later). Now we capture the
+          # response code and surface non-2xx as a workflow warning, so
+          # the run page shows which slug leaked. We still don't `exit 1`
+          # on cleanup failure — a single-canary cleanup miss shouldn't
+          # fail-flag the canary itself when the actual smoke check
+          # passed. The sweep-stale-e2e-orgs cron (now every 15 min,
+          # 30-min threshold) is the safety net for whatever slips past.
+          # See molecule-controlplane#420.
+          leaks=()
           for slug in $orgs; do
-            curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
+            code=$(curl -sS -o /tmp/canary-cleanup.out -w "%{http_code}" \
+              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
               -H "Authorization: Bearer $ADMIN_TOKEN" \
               -H "Content-Type: application/json" \
-              -d "{\"confirm\":\"$slug\"}" >/dev/null || true
+              -d "{\"confirm\":\"$slug\"}" \
+              ) || code="000"
+            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
+              echo "[teardown] deleted $slug (HTTP $code)"
+            else
+              echo "::warning::canary teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canary-cleanup.out 2>/dev/null)"
+              leaks+=("$slug")
+            fi
           done
+          if [ ${#leaks[@]} -gt 0 ]; then
+            echo "::warning::canary teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
+          fi
           exit 0

diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml
index 5a0dce30..d2fcb8be 100644
--- a/.github/workflows/sweep-stale-e2e-orgs.yml
+++ b/.github/workflows/sweep-stale-e2e-orgs.yml
@@ -25,16 +25,23 @@ name: Sweep stale e2e-* orgs (staging)
 
 on:
   schedule:
-    # Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
-    # clock from create to teardown). Anything older than the
-    # MAX_AGE_MINUTES threshold below is presumed dead.
-    - cron: '0 * * * *'
+    # Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
+    # create to teardown — canary is ~8 min, full SaaS ~25 min). The
+    # previous hourly + 120-min stale threshold meant a leaked tenant
+    # could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
+    # leak. Tightening the cadence + threshold reduces the worst-case
+    # leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
+    # threshold) without risk of catching in-progress runs (the longest
+    # e2e run is the ~25-min full SaaS run, still under the 30-min
+    # threshold). See molecule-controlplane#420 for the leak-class
+    # accounting that motivated this tightening.
+    - cron: '*/15 * * * *'
   workflow_dispatch:
     inputs:
       max_age_minutes:
-        description: "Delete e2e-* orgs older than N minutes (default 120)"
+        description: "Delete e2e-* orgs older than N minutes (default 30)"
         required: false
-        default: "120"
+        default: "30"
       dry_run:
         description: "Dry run only — list what would be deleted"
         required: false
@@ -58,7 +65,7 @@ jobs:
     env:
       MOLECULE_CP_URL: https://staging-api.moleculesai.app
       ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
-      MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
+      MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
       DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
       # Refuse to delete more than this many orgs in one tick. If the
       # CP DB is briefly empty (or the admin endpoint goes weird and
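
Note (commentary, not part of the diff): the contrast between the old
`>/dev/null || true` idiom and the HTTP-code capture is easy to
reproduce outside CI. Below is a minimal standalone sketch, assuming
only bash and curl; the endpoint argument and the /tmp/cleanup.out
scratch path are placeholders, not the real CP admin API:

    #!/usr/bin/env bash
    # Side-by-side repro of the two teardown idioms. Placeholder
    # endpoint: any HTTP URL works; try one that returns 5xx, or a
    # closed port, to see the difference.
    set -u
    ENDPOINT="${1:?usage: $0 <url>}"

    # Old idiom: body and exit status both discarded; a 5xx or a
    # transport error is indistinguishable from a clean 204.
    curl -sS -X DELETE "$ENDPOINT" >/dev/null || true
    echo "old idiom: finished with no signal either way"

    # New idiom: keep the status code for the success check and the
    # response body for the warning text. curl exits non-zero on
    # transport errors, so the fallback pins code to 000 instead of
    # leaving it empty or concatenated with stray output.
    code=$(curl -sS -o /tmp/cleanup.out -w "%{http_code}" \
      -X DELETE "$ENDPOINT") || code="000"
    if [ "$code" = "200" ] || [ "$code" = "204" ]; then
      echo "deleted (HTTP $code)"
    else
      echo "::warning::DELETE returned HTTP $code. Body: $(head -c 300 /tmp/cleanup.out 2>/dev/null)"
    fi

Run it once against a healthy endpoint and once against a failing one:
the first idiom prints the same thing in both cases, while the second
surfaces the status code and the first 300 bytes of the response body.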