From 88ff0d770be72f2e6b0976146548f0a69a44e6e5 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Tue, 5 May 2026 19:36:20 -0700
Subject: [PATCH] chore(sweep): add orphan-tunnel cleanup step (#2987 / #340)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 15-min sweeper has been deleting stale e2e orgs but not the orphan
tunnels left behind when the org-delete cascade half-fails (CP transient
5xx after the org row is gone but before the CF tunnel delete completes).
Result: tunnels accumulate in CF until manual operator cleanup.

Add a final step that POSTs `/cp/admin/orphan-tunnels/cleanup` every
tick. Best-effort — failure doesn't fail the workflow; next tick
re-attempts. Output reports deleted_count + failed count for ops
visibility.

This is the catch-all for the orphan-tunnel class. The proper upstream
fix (transactional org delete) lives in CP and tracks as issue #2989.
Until that lands, the sweeper bounded-time-to-cleanup keeps the leak
from escalating.

Note: PR #492 (cf-tunnel silent-success fix) makes this step actually
effective — pre-fix DeleteTunnel silent-succeeded on 1022, so the
cleanup endpoint reported success without deleting. Post-fix the cleanup
chains CleanupTunnelConnections + retry on 1022, which actually clears
stuck-connector orphans.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
 .github/workflows/sweep-stale-e2e-orgs.yml | 42 +++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml
index a8427672..18bec191 100644
--- a/.github/workflows/sweep-stale-e2e-orgs.yml
+++ b/.github/workflows/sweep-stale-e2e-orgs.yml
@@ -193,7 +193,47 @@ jobs:
           # sweeper is best-effort. Next hourly tick re-attempts. We
           # only fail loud at the safety-cap gate above.

+      - name: Sweep orphan tunnels
+        # Stale-org cleanup deletes the org (which cascades to tunnel
+        # delete inside the CP). But when that cascade fails partway —
+        # CP transient 5xx after the org row is deleted but before the
+        # CF tunnel delete completes — the tunnel persists with no
+        # matching org row. The reconciler in internal/sweep flags this
+        # as `cf_tunnel kind=orphan`, but nothing automatically reaps it.
+        #
+        # `/cp/admin/orphan-tunnels/cleanup` is the operator-triggered
+        # reaper. Calling it here at the end of every sweep tick
+        # converges the staging CF account to clean even when CP
+        # cascades half-fail.
+        #
+        # PR #492 made the underlying DeleteTunnel actually check
+        # status — pre-fix it silent-succeeded on CF code 1022
+        # ("active connections"), so this step would have been a no-op
+        # against stuck connectors. Post-fix the cleanup invokes
+        # CleanupTunnelConnections + retry, which actually clears the
+        # 1022 case. (#2987)
+        #
+        # Best-effort. Failure here doesn't fail the workflow — next
+        # tick re-attempts. Errors flow to step output for ops review.
+        if: env.DRY_RUN != 'true'
+        run: |
+          set +e
+          curl -sS -o /tmp/cleanup_resp -w "%{http_code}" \
+            --max-time 60 \
+            -X POST "$MOLECULE_CP_URL/cp/admin/orphan-tunnels/cleanup" \
+            -H "Authorization: Bearer $ADMIN_TOKEN" >/tmp/cleanup_code
+          set -e
+          http_code=$(cat /tmp/cleanup_code 2>/dev/null || echo "000")
+          body=$(cat /tmp/cleanup_resp 2>/dev/null || true)  # full body: truncated JSON would not parse
+          if [ "$http_code" = "200" ]; then
+            count=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(d.get('deleted_count', 0))" 2>/dev/null || echo "0")
+            failed_n=$(echo "$body" | python3 -c "import sys,json; d=json.loads(sys.stdin.read() or '{}'); print(len(d.get('failed') or {}))" 2>/dev/null || echo "0")
+            echo "Orphan-tunnel sweep: deleted=$count failed=$failed_n"
+          else
+            echo "::warning::orphan-tunnels cleanup returned HTTP $http_code — body: ${body:0:500}"
+          fi
+
       - name: Dry-run summary
         if: env.DRY_RUN == 'true'
         run: |
-          echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s). Re-run with dry_run=false to actually delete."
+          echo "DRY RUN — would have deleted ${{ steps.identify.outputs.count }} org(s) AND triggered orphan-tunnels cleanup. Re-run with dry_run=false to actually delete."