Merge pull request #2328 from Molecule-AI/auto/sweep-cf-tunnel-orphans

feat(ops): sweep-cf-tunnels janitor — orphan tunnels accumulate forever
This commit is contained in:
Hongming Wang 2026-04-30 02:45:16 +00:00 committed by GitHub
commit 16796431b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 369 additions and 0 deletions

112
.github/workflows/sweep-cf-tunnels.yml vendored Normal file
View File

@@ -0,0 +1,112 @@
name: Sweep stale Cloudflare Tunnels
# Janitor for Cloudflare Tunnels whose backing tenant no longer
# exists. Parallel-shape to sweep-cf-orphans.yml (which sweeps DNS
# records); same justification, different CF resource.
#
# Why this exists separately from sweep-cf-orphans:
#   - DNS records live on the zone (`/zones/<id>/dns_records`).
#   - Tunnels live on the account (`/accounts/<id>/cfd_tunnel`).
#   - Different CF API surface, different scopes; the existing CF
#     token might not have `account:cloudflare_tunnel:edit`. Splitting
#     the workflows keeps each one's secret-presence gate independent
#     so neither silent-skips when the other's secret is missing.
#   - Cleaner blast radius — operators can disable one without the
#     other if a regression surfaces.
#
# Safety: the script's MAX_DELETE_PCT gate (default 90% — higher than
# the DNS sweep's 50% because tenant-shaped tunnels are mostly
# orphans by design) refuses to nuke past the threshold.

on:
  schedule:
    # Hourly at :45 — offset from sweep-cf-orphans (:15) so the two
    # janitors don't issue parallel CF API bursts at the same minute.
    - cron: '45 * * * *'
  workflow_dispatch:
    inputs:
      dry_run:
        description: "Dry run only — list what would be deleted, no deletion"
        required: false
        type: boolean
        default: true
      max_delete_pct:
        # Kept as a string input (not number) so it passes through to
        # the shell env unchanged.
        description: "Override safety gate (default 90, set higher only for major cleanup)"
        required: false
        default: "90"

# Don't let two sweeps race the same account.
concurrency:
  group: sweep-cf-tunnels
  cancel-in-progress: false

# Least privilege: the job only reads the repo to get the script.
permissions:
  contents: read

jobs:
  sweep:
    name: Sweep CF tunnels
    runs-on: ubuntu-latest
    # 5 min surfaces hangs (CF API stall, slow pagination on busy
    # accounts). Realistic worst case is ~3 min: 2 CP curls + N CF
    # list pages + N×CF-DELETE, each capped at 10-15s by curl -m.
    timeout-minutes: 5
    # Job-level env so both steps (verify + run) see the same values.
    env:
      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
      CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
      CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
      # On schedule there are no inputs → expression falls back to '90',
      # matching the script's own default.
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      - name: Verify required secrets present
        id: verify
        # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
        # (hardened 2026-04-28 after the silent-no-op incident: the
        # janitor reported green while doing nothing because secrets
        # were unset, masking a 152/200 zone-record leak). Same
        # principle applies here:
        #   - schedule → exit 1 on missing secrets (red CI surfaces it)
        #   - workflow_dispatch → exit 0 with warning (operator-driven,
        #     they already accepted the repo state)
        run: |
          missing=()
          for var in CF_API_TOKEN CF_ACCOUNT_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do
            if [ -z "${!var:-}" ]; then
              missing+=("$var")
            fi
          done
          if [ ${#missing[@]} -gt 0 ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
              echo "::warning::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope (separate from the zone:dns:edit scope used by sweep-cf-orphans)."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
            echo "::error::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope."
            exit 1
          fi
          echo "All required secrets present ✓"
          echo "skip=false" >> "$GITHUB_OUTPUT"
      - name: Run sweep
        if: steps.verify.outputs.skip != 'true'
        # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-orphans:
        #   - Scheduled: input empty → "false" → --execute (the whole
        #     point of an hourly janitor).
        #   - Manual workflow_dispatch: input default true → dry-run;
        #     operator must flip it to actually delete.
        run: |
          set -euo pipefail
          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
            echo "Running in dry-run mode — no deletions"
            bash scripts/ops/sweep-cf-tunnels.sh
          else
            echo "Running with --execute — will delete identified orphans"
            bash scripts/ops/sweep-cf-tunnels.sh --execute
          fi

257
scripts/ops/sweep-cf-tunnels.sh Executable file
View File

@@ -0,0 +1,257 @@
#!/usr/bin/env bash
# sweep-cf-tunnels.sh — safe, targeted sweep of Cloudflare Tunnels
# whose corresponding tenant no longer exists.
#
# Why this exists: CP's tenant-delete cascade removes the DNS record
# (caught by sweep-cf-orphans.sh as a backstop) but does NOT delete
# the underlying Cloudflare Tunnel. Each E2E provision creates one
# Tunnel named `tenant-<slug>`; without cleanup these accumulate
# indefinitely on the account, consuming the account's tunnel quota
# and cluttering the Cloudflare dashboard.
#
# Observed 2026-04-30: dozens of `tenant-e2e-canvas-*` tunnels in
# Down state with zero replicas, weeks past their tenant's deletion.
#
# This script is a parallel-shape janitor to sweep-cf-orphans.sh:
#   1. Query CP admin API to enumerate live org slugs (prod + staging)
#   2. Enumerate Cloudflare Tunnels via the account-scoped API
#   3. For each tunnel matching `tenant-<slug>`, check if <slug>
#      appears in the live set
#   4. Skip tunnels with active connections (defense-in-depth — never
#      delete a healthy tunnel even if CP claims the org is gone)
#   5. Only delete tunnels with NO live counterpart AND NO active
#      connections
#
# Dry-run by default; must pass --execute to actually delete.
#
# Env vars required:
#   CF_API_TOKEN            — Cloudflare token with
#                             account:cloudflare_tunnel:edit scope.
#                             (Same secret as sweep-cf-orphans, but the
#                             token must include the tunnel scope.)
#   CF_ACCOUNT_ID           — the account that owns the tunnels (visible
#                             in dash.cloudflare.com URL path)
#   CP_PROD_ADMIN_TOKEN     — CP admin bearer for api.moleculesai.app
#   CP_STAGING_ADMIN_TOKEN  — CP admin bearer for staging-api.moleculesai.app
#
# Exit codes:
#   0 — dry-run completed or sweep executed successfully
#   1 — missing required env, API failure, or unexpected state
#   2 — safety check failed (would delete >MAX_DELETE_PCT% of
#       tenant-shaped tunnels; refusing)
set -euo pipefail

# Dry-run unless --execute/--no-dry-run is passed.
DRY_RUN=1
# Tenant tunnels are short-lived by design — most of them at any
# given moment are orphans from finished E2E runs. The default is
# tuned higher than sweep-cf-orphans (50%) to reflect that the
# steady-state for tenant-* tunnels is mostly-orphan, not mostly-live.
MAX_DELETE_PCT="${MAX_DELETE_PCT:-90}"

for arg in "$@"; do
  case "$arg" in
    --execute|--no-dry-run) DRY_RUN=0 ;;
    --help|-h)
      # Print this file's header comment block as the help text.
      # Skip the shebang (NR > 1) and stop at the first non-comment
      # line instead of relying on a hard-coded line count, so help
      # stays accurate when the header grows or shrinks.
      awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
      exit 0
      ;;
    *)
      echo "unknown arg: $arg (use --help)" >&2
      exit 1
      ;;
  esac
done

# need VAR — abort with exit 1 if the named env var is unset or empty.
need() {
  local var="$1"
  if [ -z "${!var:-}" ]; then
    echo "ERROR: $var is required" >&2
    exit 1
  fi
}
need CF_API_TOKEN
need CF_ACCOUNT_ID
need CP_PROD_ADMIN_TOKEN
need CP_STAGING_ADMIN_TOKEN

# log MSG — timestamped (UTC HH:MM:SS) progress line on stdout.
log() { echo "[$(date -u +%H:%M:%S)] $*"; }
# --- Gather live sets ------------------------------------------------------
log "Fetching CP prod org slugs..."
# curl -f: fail on HTTP errors (401/403/5xx) so an auth or API problem
# aborts the run (exit 1, via set -e/pipefail) instead of feeding an
# error body into the JSON parse below — which would silently yield an
# empty slug set and classify every tenant tunnel as an orphan.
PROD_SLUGS=$(curl -fsS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \
  "https://api.moleculesai.app/cp/admin/orgs?limit=500" \
  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
log "  prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')"

log "Fetching CP staging org slugs..."
STAGING_SLUGS=$(curl -fsS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \
  "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
log "  staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')"

# Defense-in-depth: if BOTH live sets came back empty, every
# tenant-shaped tunnel would classify as an orphan. That is far more
# likely an API/permissions regression than reality — fail loudly
# rather than lean on the percentage gate alone.
if [ -z "$PROD_SLUGS" ] && [ -z "$STAGING_SLUGS" ]; then
  echo "ERROR: both CP org lists are empty — refusing to sweep (likely API or auth regression)" >&2
  exit 1
fi

log "Fetching Cloudflare tunnels..."
# The cfd_tunnel list endpoint is paginated; per_page max is 50.
# Walk all pages so we don't silently miss orphans on busy accounts.
PAGE=1
TUNNEL_JSON='{"result":[]}'
while :; do
  page_json=$(curl -fsS -m 15 -H "Authorization: Bearer $CF_API_TOKEN" \
    "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel?per_page=50&page=$PAGE&is_deleted=false")
  page_count=$(echo "$page_json" | python3 -c "import json,sys; print(len(json.load(sys.stdin).get('result') or []))")
  if [ "$page_count" = "0" ]; then break; fi
  # Merge this page's result array into the accumulator.
  TUNNEL_JSON=$(python3 -c "
import json, sys
acc = json.loads(sys.argv[1])
new = json.loads(sys.argv[2])
acc['result'].extend(new.get('result') or [])
print(json.dumps(acc))
" "$TUNNEL_JSON" "$page_json")
  PAGE=$((PAGE + 1))
  if [ "$PAGE" -gt 20 ]; then
    # Plain echo, not log(): GitHub only recognizes ::warning::
    # workflow commands at the very start of a line — log()'s
    # timestamp prefix would turn this into an ordinary log line
    # and the warning annotation would never surface.
    echo "::warning::stopping pagination at page 20 (1000 tunnels) — re-run if more"
    break
  fi
done
TOTAL_TUNNELS=$(echo "$TUNNEL_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['result']))")
log "  total tunnels: $TOTAL_TUNNELS"
# --- Compute orphans -------------------------------------------------------
#
# Rules (in order):
#   1. Name doesn't match `tenant-<slug>` → keep (unknown — never sweep
#      arbitrary tunnels that might belong to platform infra).
#   2. Tunnel has active connections (status=healthy or non-empty
#      connections array) → keep (defense-in-depth: don't kill a live
#      tunnel even if CP forgot the org).
#   3. Slug ∈ {prod_slugs staging_slugs} → keep (live tenant).
#   4. Otherwise → delete (orphan).
export PROD_SLUGS STAGING_SLUGS
DECISIONS=$(echo "$TUNNEL_JSON" | python3 -c '
import json, os, re, sys

# Union of prod + staging slugs: a tunnel is "live" if its slug is in either.
live = set(os.environ["PROD_SLUGS"].split()) | set(os.environ["STAGING_SLUGS"].split())
tenant_pat = re.compile(r"^tenant-(.+)$")

def classify(tunnel):
    """Return (action, reason, id, name, status) for one tunnel record."""
    name = tunnel.get("name", "")
    tid = tunnel.get("id", "")
    status = tunnel.get("status", "")
    match = tenant_pat.match(name)
    if match is None:
        return "keep", "not-a-tenant-tunnel", tid, name, status
    # Defense-in-depth: never delete a tunnel with live connectors.
    # The CF tunnel "status" field is one of inactive/degraded/healthy/down.
    # "down" with empty connections is the orphan state we sweep.
    if status == "healthy" or (tunnel.get("connections") or []):
        return "keep", "active-connections", tid, name, status
    if match.group(1) in live:
        return "keep", "live-tenant", tid, name, status
    return "delete", "orphan-tenant", tid, name, status

for tunnel in json.load(sys.stdin).get("result", []):
    action, reason, tid, name, status = classify(tunnel)
    print(json.dumps({"action": action, "reason": reason, "id": tid, "name": name, "status": status}))
')
# --- Summarize + safety gate ----------------------------------------------
# Single pass over DECISIONS yields both counters: how many rows are
# marked delete, and how many are tenant-shaped at all (the safety-gate
# denominator).
read -r DELETE_COUNT TENANT_TUNNELS < <(echo "$DECISIONS" | python3 -c "
import json, sys
deletes = tenant_shaped = 0
for line in sys.stdin:
    row = json.loads(line)
    if row['action'] == 'delete':
        deletes += 1
    if row['reason'] != 'not-a-tenant-tunnel':
        tenant_shaped += 1
print(deletes, tenant_shaped)
")
KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))

log ""
log "== Sweep plan =="
log "  total tunnels: $TOTAL_TUNNELS"
log "  tenant-shaped tunnels: $TENANT_TUNNELS"
log "  would delete: $DELETE_COUNT"
log "  would keep: $KEEP_COUNT"
log ""

# Per-reason breakdown of deletes
echo "$DECISIONS" | python3 -c "
import json, sys, collections
rows = [json.loads(line) for line in sys.stdin]
tally = collections.Counter(r['reason'] for r in rows if r['action'] == 'delete')
for reason, n in tally.most_common():
    print(f'  delete/{reason}: {n}')
"

# Safety gate operates against the tenant-shaped subset (the reasonable
# "all of these could conceivably be ours" denominator), not the total.
# A miscount of platform-infra tunnels shouldn't relax the gate.
if [ "$TENANT_TUNNELS" -gt 0 ]; then
  PCT=$(( DELETE_COUNT * 100 / TENANT_TUNNELS ))
  if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
    log ""
    log "SAFETY: would delete $PCT% of tenant-shaped tunnels (threshold $MAX_DELETE_PCT%) — refusing."
    log "  If this is expected (e.g. major cleanup after incident), rerun with"
    log "  MAX_DELETE_PCT=$((PCT+5)) $0 $*"
    exit 2
  fi
fi

if [ "$DRY_RUN" = "1" ]; then
  log ""
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
  log ""
  log "First 20 tunnels that would be deleted:"
  echo "$DECISIONS" | python3 -c "
import json, sys
doomed = [json.loads(line) for line in sys.stdin]
for d in [r for r in doomed if r['action'] == 'delete'][:20]:
    print(f\"  {d['reason']:25s} {d['name']:40s} status={d['status']}\")
"
  exit 0
fi
# --- Execute deletes -------------------------------------------------------
log ""
log "Executing $DELETE_COUNT deletions..."
DELETED=0
FAILED=0
# Pre-extract (id, name) for the delete rows in ONE python pass, instead
# of re-parsing each JSON line three times inside the loop. Tab-separated;
# tunnel names here are CF resource names (tenant-<slug>) and are not
# expected to contain tabs — TODO confirm if naming ever changes.
DELETE_ROWS=$(echo "$DECISIONS" | python3 -c '
import json, sys
for line in sys.stdin:
    d = json.loads(line)
    if d["action"] == "delete":
        print(d["id"] + "\t" + d["name"])
')
while IFS=$'\t' read -r tid name; do
  # Herestring on an empty DELETE_ROWS still yields one blank line; skip it.
  [ -n "$tid" ] || continue
  # Parse the response JSON instead of grepping for the literal
  # `"success":true` — the grep breaks on any whitespace/formatting in
  # the API response and would misreport successful deletes as failures.
  # Any curl failure or non-JSON body also lands in the FAILED branch.
  if curl -sS -m 10 -X DELETE \
    -H "Authorization: Bearer $CF_API_TOKEN" \
    "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid" \
    | python3 -c '
import json, sys
try:
    ok = json.load(sys.stdin).get("success") is True
except Exception:
    ok = False
sys.exit(0 if ok else 1)
'; then
    DELETED=$((DELETED+1))
  else
    FAILED=$((FAILED+1))
    log "  FAILED: $name ($tid)"
  fi
done <<< "$DELETE_ROWS"

log ""
log "Done. deleted=$DELETED failed=$FAILED"
# Exit non-zero if any delete failed, so CI surfaces partial failures.
[ "$FAILED" -eq 0 ]