From 3cd8c53de032f56a52ae4f6c526ef6654029447f Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Sun, 3 May 2026 16:08:40 -0700
Subject: [PATCH] ci: tighten e2e cleanup race window 120m -> ~45m worst case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes that close one of the leak classes from the
molecule-controlplane#420 vCPU audit:

1. sweep-stale-e2e-orgs.yml: cron */15 (was hourly), MAX_AGE_MINUTES 30
   (was 120). E2E runs are 8-25 min wall clock; 30 min is safely above
   the longest run while shrinking the worst-case leak window from ~2h
   to ~45 min (15-min sweep cadence + 30-min threshold).

2. canary-staging.yml teardown: the per-slug DELETE used
   `>/dev/null || true`, which swallowed every failure. A 5xx or
   timeout from CP looked identical to "successfully deleted" and the
   canary tenant kept eating ~2 vCPU until the sweeper caught it. Now
   we capture the response code and surface non-2xx as a workflow
   warning that names the leaked slug. The exit semantics stay
   unchanged — a single-canary cleanup miss shouldn't fail-flag the
   canary itself when the actual smoke check passed. The sweeper is
   the safety net for whatever slips past.

Caught during the molecule-controlplane#420 audit on 2026-05-03 —
3 e2e canary tenant orphans had been running for 24-95 min, all under
the previous 120-min sweep threshold, so they went unnoticed until
manual cleanup.

The same `|| true` pattern exists in
e2e-staging-{canvas,external,saas,sanity}.yml; that is out of scope for
this PR (mechanical port; tracking separately), but the sweeper
tightening covers all of those workflows by reducing the safety-net
latency.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/canary-staging.yml       | 28 ++++++++++++++++++++++++++--
 .github/workflows/sweep-stale-e2e-orgs.yml | 21 ++++++++++++++-------
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml
index 396d53ca..93f53ca7 100644
--- a/.github/workflows/canary-staging.yml
+++ b/.github/workflows/canary-staging.yml
@@ -231,10 +231,34 @@ jobs:
                      and o.get('status') not in ('purged',)]
              print('\n'.join(candidates))
           " 2>/dev/null)
+          # Per-slug DELETE with HTTP-code verification. The previous
+          # `... >/dev/null || true` swallowed every failure, so a 5xx
+          # or timeout from CP looked identical to "successfully cleaned
+          # up" and the tenant kept eating ~2 vCPU until the hourly
+          # stale sweep caught it (up to 2h later). Now we capture the
+          # response code and surface non-2xx as a workflow warning, so
+          # the run page shows which slug leaked. We still don't `exit 1`
+          # on cleanup failure — a single-canary cleanup miss shouldn't
+          # fail-flag the canary itself when the actual smoke check
+          # passed. The sweep-stale-e2e-orgs cron (now every 15 min,
+          # 30-min threshold) is the safety net for whatever slips past.
+          # See molecule-controlplane#420.
+          leaks=()
           for slug in $orgs; do
-            curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
+            code=$(curl -sS -o /tmp/canary-cleanup.out -w "%{http_code}" \
+              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
               -H "Authorization: Bearer $ADMIN_TOKEN" \
               -H "Content-Type: application/json" \
-              -d "{\"confirm\":\"$slug\"}" >/dev/null || true
+              -d "{\"confirm\":\"$slug\"}" \
+              ) || code="000"
+            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
+              echo "[teardown] deleted $slug (HTTP $code)"
+            else
+              echo "::warning::canary teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canary-cleanup.out 2>/dev/null)"
+              leaks+=("$slug")
+            fi
           done
+          if [ ${#leaks[@]} -gt 0 ]; then
+            echo "::warning::canary teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
+          fi
           exit 0

diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml
index 5a0dce30..d2fcb8be 100644
--- a/.github/workflows/sweep-stale-e2e-orgs.yml
+++ b/.github/workflows/sweep-stale-e2e-orgs.yml
@@ -25,16 +25,23 @@ name: Sweep stale e2e-* orgs (staging)
 
 on:
   schedule:
-    # Every hour on the hour. E2E orgs are short-lived (~10-25 min wall
-    # clock from create to teardown). Anything older than the
-    # MAX_AGE_MINUTES threshold below is presumed dead.
-    - cron: '0 * * * *'
+    # Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from
+    # create to teardown — canary is ~8 min, full SaaS ~25 min). The
+    # previous hourly + 120-min stale threshold meant a leaked tenant
+    # could keep an EC2 alive for up to 2 hours, eating ~2 vCPU per
+    # leak. Tightening the cadence + threshold reduces the worst-case
+    # leak window from 120 min to ~45 min (15-min sweep cadence + 30-min
+    # threshold) without risk of catching in-progress runs (the longest
+    # e2e run is the ~25-min full SaaS run, still under the 30-min
+    # threshold). See molecule-controlplane#420 for the leak-class
+    # accounting that motivated this tightening.
+    - cron: '*/15 * * * *'
   workflow_dispatch:
     inputs:
       max_age_minutes:
-        description: "Delete e2e-* orgs older than N minutes (default 120)"
+        description: "Delete e2e-* orgs older than N minutes (default 30)"
         required: false
-        default: "120"
+        default: "30"
       dry_run:
         description: "Dry run only — list what would be deleted"
         required: false
@@ -58,7 +65,7 @@ jobs:
     env:
       MOLECULE_CP_URL: https://staging-api.moleculesai.app
       ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
-      MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }}
+      MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }}
       DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
       # Refuse to delete more than this many orgs in one tick. If the
       # CP DB is briefly empty (or the admin endpoint goes weird and
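
Note (commentary, not part of the diff): the contrast between the old
`>/dev/null || true` idiom and the HTTP-code capture is easy to
reproduce outside CI. Below is a minimal standalone sketch, assuming
only bash and curl; the endpoint argument and the /tmp/cleanup.out
scratch path are placeholders, not the real CP admin API:

    #!/usr/bin/env bash
    # Side-by-side repro of the two teardown idioms. Placeholder
    # endpoint: any HTTP URL works; try one that returns 5xx, or a
    # closed port, to see the difference.
    set -u
    ENDPOINT="${1:?usage: $0 <url>}"

    # Old idiom: body and exit status both discarded; a 5xx or a
    # transport error is indistinguishable from a clean 204.
    curl -sS -X DELETE "$ENDPOINT" >/dev/null || true
    echo "old idiom: finished with no signal either way"

    # New idiom: keep the status code for the success check and the
    # response body for the warning text. curl exits non-zero on
    # transport errors, so the fallback pins code to 000 instead of
    # leaving it empty or concatenated with stray output.
    code=$(curl -sS -o /tmp/cleanup.out -w "%{http_code}" \
      -X DELETE "$ENDPOINT") || code="000"
    if [ "$code" = "200" ] || [ "$code" = "204" ]; then
      echo "deleted (HTTP $code)"
    else
      echo "::warning::DELETE returned HTTP $code. Body: $(head -c 300 /tmp/cleanup.out 2>/dev/null)"
    fi

Run it once against a healthy endpoint and once against a failing one:
the first idiom prints the same thing in both cases, while the second
surfaces the status code and the first 300 bytes of the response body.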