From 8b9e7e6d59efebcf960cc47eba2587e26bb25564 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 16:24:43 -0700 Subject: [PATCH] ci: port DELETE-verify pattern to remaining staging e2e workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #2648 — same `>/dev/null || true` swallow-on-error pattern existed in: e2e-staging-canvas.yml (single-slug) e2e-staging-saas.yml (loop) e2e-staging-sanity.yml (loop) e2e-staging-external.yml (loop, was `>/dev/null 2>&1` variant) All four now capture the HTTP code, log a "[teardown] deleted $slug (HTTP $code)" line on success, and emit a workflow warning naming the slug + body excerpt on anything other than HTTP 200 or 204 (other 2xx codes and curl transport failures included). Loop bodies also tally + summarise total leaks at the end. Exit semantics unchanged: a single cleanup miss still doesn't fail-flag the test (sweep-stale-e2e-orgs is the safety net within ~45 min). The behavior change is purely surfacing — failures that were silent are now visible on the workflow run page. Pairs with #2648's tightened sweeper. Together: per-run cleanup failures are visible AND the safety net catches them quickly. Closes the per-workflow port noted as out-of-scope in #2648. See molecule-controlplane#420. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/e2e-staging-canvas.yml | 19 +++++++++++++++++-- .github/workflows/e2e-staging-external.yml | 20 ++++++++++++++++++-- .github/workflows/e2e-staging-saas.yml | 20 ++++++++++++++++++-- .github/workflows/e2e-staging-sanity.yml | 19 +++++++++++++++++-- 4 files changed, 70 insertions(+), 8 deletions(-) diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml index c1620a20..6c59e72a 100644 --- a/.github/workflows/e2e-staging-canvas.yml +++ b/.github/workflows/e2e-staging-canvas.yml @@ -184,8 +184,23 @@ jobs: exit 0 fi echo "Deleting orphan tenant: $slug" - curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + # Verify HTTP 2xx instead of `>/dev/null || true` swallowing + # failures. A 5xx or timeout previously looked identical to + # success, leaving the tenant alive for up to ~45 min until + # sweep-stale-e2e-orgs caught it. Surface failures as + # workflow warnings naming the slug. Don't `exit 1` — a single + # cleanup miss shouldn't fail-flag the canvas test when the + # actual smoke check passed; the sweeper is the safety net. + # See molecule-controlplane#420. + code=$(curl -sS -o /tmp/canvas-cleanup.out -w "%{http_code}" \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" >/dev/null || true + -d "{\"confirm\":\"$slug\"}" \ + || echo "000") + if [ "$code" = "200" ] || [ "$code" = "204" ]; then + echo "[teardown] deleted $slug (HTTP $code)" + else + echo "::warning::canvas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. 
Body: $(head -c 300 /tmp/canvas-cleanup.out 2>/dev/null)" + fi exit 0 diff --git a/.github/workflows/e2e-staging-external.yml b/.github/workflows/e2e-staging-external.yml index d1d8def7..12ac4577 100644 --- a/.github/workflows/e2e-staging-external.yml +++ b/.github/workflows/e2e-staging-external.yml @@ -153,12 +153,28 @@ jobs: if [ -n "$orgs" ]; then echo "Safety-net sweep: deleting leftover orgs:" echo "$orgs" + # Per-slug verified DELETE — see molecule-controlplane#420. + # `>/dev/null 2>&1` previously hid every failure; surface + # non-2xx as workflow warnings so the run page names what + # leaked. Sweeper catches the rest within ~45 min. + leaks=() for slug in $orgs; do - curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + code=$(curl -sS -o /tmp/external-cleanup.out -w "%{http_code}" \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" >/dev/null 2>&1 + -d "{\"confirm\":\"$slug\"}" \ + || echo "000") + if [ "$code" = "200" ] || [ "$code" = "204" ]; then + echo "[teardown] deleted $slug (HTTP $code)" + else + echo "::warning::external teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/external-cleanup.out 2>/dev/null)" + leaks+=("$slug") + fi done + if [ ${#leaks[@]} -gt 0 ]; then + echo "::warning::external teardown left ${#leaks[@]} leak(s): ${leaks[*]}" + fi else echo "Safety-net sweep: no leftover orgs to clean." fi diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml index f055c491..2a7efe16 100644 --- a/.github/workflows/e2e-staging-saas.yml +++ b/.github/workflows/e2e-staging-saas.yml @@ -164,11 +164,27 @@ jobs: and o.get('instance_status') not in ('purged',)] print('\n'.join(candidates)) " 2>/dev/null) + # Per-slug verified DELETE (was `>/dev/null || true` — see + # molecule-controlplane#420). 
Surface non-2xx as a workflow + # warning naming the leaked slug; don't exit 1 (sweeper is + # the safety net within ~45 min). + leaks=() for slug in $orgs; do echo "Safety-net teardown: $slug" - curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + code=$(curl -sS -o /tmp/saas-cleanup.out -w "%{http_code}" \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" >/dev/null || true + -d "{\"confirm\":\"$slug\"}" \ + || echo "000") + if [ "$code" = "200" ] || [ "$code" = "204" ]; then + echo "[teardown] deleted $slug (HTTP $code)" + else + echo "::warning::saas teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/saas-cleanup.out 2>/dev/null)" + leaks+=("$slug") + fi done + if [ ${#leaks[@]} -gt 0 ]; then + echo "::warning::saas teardown left ${#leaks[@]} leak(s): ${leaks[*]}" + fi exit 0 diff --git a/.github/workflows/e2e-staging-sanity.yml b/.github/workflows/e2e-staging-sanity.yml index edfa5359..e98b38fe 100644 --- a/.github/workflows/e2e-staging-sanity.yml +++ b/.github/workflows/e2e-staging-sanity.yml @@ -143,10 +143,25 @@ jobs: and o.get('status') not in ('purged',)] print('\n'.join(candidates)) " 2>/dev/null) + # Per-slug verified DELETE — see molecule-controlplane#420. + # Failures surface as workflow warnings; the sweeper is the + # safety net within ~45 min. 
+ leaks=() for slug in $orgs; do - curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + code=$(curl -sS -o /tmp/sanity-cleanup.out -w "%{http_code}" \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" >/dev/null || true + -d "{\"confirm\":\"$slug\"}" \ + || echo "000") + if [ "$code" = "200" ] || [ "$code" = "204" ]; then + echo "[teardown] deleted $slug (HTTP $code)" + else + echo "::warning::sanity teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/sanity-cleanup.out 2>/dev/null)" + leaks+=("$slug") + fi done + if [ ${#leaks[@]} -gt 0 ]; then + echo "::warning::sanity teardown left ${#leaks[@]} leak(s): ${leaks[*]}" + fi exit 0