From 3a6d2f179d02f41bbf28580f50f02a56377d7c1a Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Wed, 29 Apr 2026 19:42:47 -0700
Subject: [PATCH] =?UTF-8?q?feat(ops):=20add=20sweep-cf-tunnels=20janitor?=
 =?UTF-8?q?=20=E2=80=94=20orphan=20Cloudflare=20Tunnels=20accumulate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CP's tenant-delete cascade removes the DNS record (with
sweep-cf-orphans as a backstop) but does NOT delete the underlying
Cloudflare Tunnel. Each E2E provision creates one Tunnel named
`tenant-<slug>`; without cleanup these accumulate indefinitely on the
account, consuming the tunnel quota and cluttering the dashboard.

Observed 2026-04-30: dozens of `tenant-e2e-canvas-*` tunnels in Down
state with zero replicas, weeks past their tenant's deletion. Same
class of bug as the DNS-records leak that drove sweep-cf-orphans
(controlplane#239).

Parallel-shape to sweep-cf-orphans:
- Same dry-run-by-default + --execute pattern
- Same MAX_DELETE_PCT safety gate (default 90% — higher than DNS
  sweep's 50% because tenant-shaped tunnels are orphans by design)
- Same schedule/dispatch hardening (hard-fail on missing secrets when
  scheduled, soft-skip when dispatched)
- Cron offset to :45 to avoid CF API bursts colliding with the DNS
  sweep at :15

Decision rules (in order):
1. Name doesn't match `tenant-<slug>` → keep (unknown — never sweep
   tunnels that might belong to platform infra).
2. Tunnel has active connections (status=healthy or non-empty
   connections array) → keep (defense-in-depth: don't kill a live
   tunnel even if CP forgot the org).
3. Slug ∈ {prod_slugs ∪ staging_slugs} → keep.
4. Otherwise → delete (orphan).
Verified by:
- shell syntax check (bash -n)
- YAML lint
- Decide-logic offline smoke (7 cases, all pass)
- End-to-end dry-run smoke with stubbed CP + CF APIs

Required secrets (added to existing org-secrets):
  CF_API_TOKEN           must include account:cloudflare_tunnel:edit scope
                         (separate from zone:dns:edit used by
                         sweep-cf-orphans — same token if scope is broad,
                         or a new token if narrowly scoped).
  CF_ACCOUNT_ID          account that owns the tunnels (visible in
                         dash.cloudflare.com URL path).
  CP_PROD_ADMIN_TOKEN    reused from sweep-cf-orphans.
  CP_STAGING_ADMIN_TOKEN reused from sweep-cf-orphans.

Note: CP-side root cause (tenant-delete should cascade to tunnel
delete) is in molecule-controlplane and worth fixing separately. This
janitor is the operational backstop in the meantime — same pattern
applied to DNS records when the same root cause was unaddressed.
---
 .github/workflows/sweep-cf-tunnels.yml | 112 +++++++++++
 scripts/ops/sweep-cf-tunnels.sh        | 257 +++++++++++++++++++++++++
 2 files changed, 369 insertions(+)
 create mode 100644 .github/workflows/sweep-cf-tunnels.yml
 create mode 100755 scripts/ops/sweep-cf-tunnels.sh

diff --git a/.github/workflows/sweep-cf-tunnels.yml b/.github/workflows/sweep-cf-tunnels.yml
new file mode 100644
index 00000000..3d29b44e
--- /dev/null
+++ b/.github/workflows/sweep-cf-tunnels.yml
@@ -0,0 +1,112 @@
+name: Sweep stale Cloudflare Tunnels
+
+# Janitor for Cloudflare Tunnels whose backing tenant no longer
+# exists. Parallel-shape to sweep-cf-orphans.yml (which sweeps DNS
+# records); same justification, different CF resource.
+#
+# Why this exists separately from sweep-cf-orphans:
+#   - DNS records live on the zone (`/zones/<zone_id>/dns_records`).
+#   - Tunnels live on the account (`/accounts/<account_id>/cfd_tunnel`).
+#   - Different CF API surface, different scopes; the existing CF
+#     token might not have `account:cloudflare_tunnel:edit`. Splitting
+#     the workflows keeps each one's secret-presence gate independent
+#     so neither silent-skips when the other's secret is missing.
+#   - Cleaner blast radius — operators can disable one without the
+#     other if a regression surfaces.
+#
+# Safety: the script's MAX_DELETE_PCT gate (default 90% — higher than
+# the DNS sweep's 50% because tenant-shaped tunnels are mostly
+# orphans by design) refuses to nuke past the threshold.
+
+on:
+  schedule:
+    # Hourly at :45 — offset from sweep-cf-orphans (:15) so the two
+    # janitors don't issue parallel CF API bursts at the same minute.
+    - cron: '45 * * * *'
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry run only — list what would be deleted, no deletion"
+        required: false
+        type: boolean
+        default: true
+      max_delete_pct:
+        description: "Override safety gate (default 90, set higher only for major cleanup)"
+        required: false
+        default: "90"
+
+# Don't let two sweeps race the same account.
+concurrency:
+  group: sweep-cf-tunnels
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  sweep:
+    name: Sweep CF tunnels
+    runs-on: ubuntu-latest
+    # 5 min surfaces hangs (CF API stall, slow pagination on busy
+    # accounts). Realistic worst case is ~3 min: 2 CP curls + N CF
+    # list pages + N×CF-DELETE, each capped at 10-15s by curl -m.
+    timeout-minutes: 5
+    env:
+      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
+      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+      CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
+      CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
+      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
+
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Verify required secrets present
+        id: verify
+        # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
+        # (hardened 2026-04-28 after the silent-no-op incident: the
+        # janitor reported green while doing nothing because secrets
+        # were unset, masking a 152/200 zone-record leak). Same
+        # principle applies here:
+        #   - schedule → exit 1 on missing secrets (red CI surfaces it)
+        #   - workflow_dispatch → exit 0 with warning (operator-driven,
+        #     they already accepted the repo state)
+        run: |
+          missing=()
+          for var in CF_API_TOKEN CF_ACCOUNT_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do
+            if [ -z "${!var:-}" ]; then
+              missing+=("$var")
+            fi
+          done
+          if [ ${#missing[@]} -gt 0 ]; then
+            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
+              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
+              echo "::warning::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope (separate from the zone:dns:edit scope used by sweep-cf-orphans)."
+              echo "skip=true" >> "$GITHUB_OUTPUT"
+              exit 0
+            fi
+            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
+            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
+            echo "::error::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope."
+            exit 1
+          fi
+          echo "All required secrets present ✓"
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+
+      - name: Run sweep
+        if: steps.verify.outputs.skip != 'true'
+        # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-orphans:
+        #   - Scheduled: input empty → "false" → --execute (the whole
+        #     point of an hourly janitor).
+        #   - Manual workflow_dispatch: input default true → dry-run;
+        #     operator must flip it to actually delete.
+        run: |
+          set -euo pipefail
+          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
+            echo "Running in dry-run mode — no deletions"
+            bash scripts/ops/sweep-cf-tunnels.sh
+          else
+            echo "Running with --execute — will delete identified orphans"
+            bash scripts/ops/sweep-cf-tunnels.sh --execute
+          fi
diff --git a/scripts/ops/sweep-cf-tunnels.sh b/scripts/ops/sweep-cf-tunnels.sh
new file mode 100755
index 00000000..7834c80c
--- /dev/null
+++ b/scripts/ops/sweep-cf-tunnels.sh
@@ -0,0 +1,257 @@
+#!/usr/bin/env bash
+# sweep-cf-tunnels.sh — safe, targeted sweep of Cloudflare Tunnels
+# whose corresponding tenant no longer exists.
+#
+# Why this exists: CP's tenant-delete cascade removes the DNS record
+# (caught by sweep-cf-orphans.sh as a backstop) but does NOT delete
+# the underlying Cloudflare Tunnel. Each E2E provision creates one
+# Tunnel named `tenant-<slug>`; without cleanup these accumulate
+# indefinitely on the account, consuming the account's tunnel quota
+# and cluttering the Cloudflare dashboard.
+#
+# Observed 2026-04-30: dozens of `tenant-e2e-canvas-*` tunnels in
+# Down state with zero replicas, weeks past their tenant's deletion.
+#
+# This script is a parallel-shape janitor to sweep-cf-orphans.sh:
+# 1. Query CP admin API to enumerate live org slugs (prod + staging)
+# 2. Enumerate Cloudflare Tunnels via the account-scoped API
+# 3. For each tunnel matching `tenant-<slug>`, check if <slug>
+# appears in the live set
+# 4.
Skip tunnels with active connections (defense-in-depth — never
+# delete a healthy tunnel even if CP claims the org is gone)
+# 5. Only delete tunnels with NO live counterpart AND NO active
+# connections
+#
+# Dry-run by default; must pass --execute to actually delete.
+#
+# Env vars required:
+#   CF_API_TOKEN           — Cloudflare token with
+#                            account:cloudflare_tunnel:edit scope.
+#                            (Same secret as sweep-cf-orphans, but the
+#                            token must include the tunnel scope.)
+#   CF_ACCOUNT_ID          — the account that owns the tunnels (visible
+#                            in dash.cloudflare.com URL path)
+#   CP_PROD_ADMIN_TOKEN    — CP admin bearer for api.moleculesai.app
+#   CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app
+#
+# Exit codes:
+#   0 — dry-run completed or sweep executed successfully
+#   1 — missing required env, API failure, or unexpected state
+#   2 — safety check failed (would delete >MAX_DELETE_PCT% of
+#       tenant-shaped tunnels; refusing)
+
+set -euo pipefail
+
+DRY_RUN=1
+# Tenant tunnels are short-lived by design — most of them at any
+# given moment are orphans from finished E2E runs. The default is
+# tuned higher than sweep-cf-orphans (50%) to reflect that the
+# steady-state for tenant-* tunnels is mostly-orphan, not mostly-live.
+MAX_DELETE_PCT="${MAX_DELETE_PCT:-90}"
+
+for arg in "$@"; do
+  case "$arg" in
+    --execute|--no-dry-run) DRY_RUN=0 ;;
+    --help|-h)
+      grep '^#' "$0" | head -45 | sed 's/^# \{0,1\}//'
+      exit 0
+      ;;
+    *)
+      echo "unknown arg: $arg (use --help)" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Fail fast with a named error when a required env var is unset/empty.
+need() {
+  local var="$1"
+  if [ -z "${!var:-}" ]; then
+    echo "ERROR: $var is required" >&2
+    exit 1
+  fi
+}
+need CF_API_TOKEN
+need CF_ACCOUNT_ID
+need CP_PROD_ADMIN_TOKEN
+need CP_STAGING_ADMIN_TOKEN
+
+log() { echo "[$(date -u +%H:%M:%S)] $*"; }
+
+# --- Gather live sets ------------------------------------------------------
+
+log "Fetching CP prod org slugs..."
+PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=500" \
+  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
+log "  prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')"
+
+log "Fetching CP staging org slugs..."
+STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \
+  "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
+  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
+log "  staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')"
+
+log "Fetching Cloudflare tunnels..."
+# The cfd_tunnel list endpoint is paginated; per_page max is 50.
+# Walk all pages so we don't silently miss orphans on busy accounts.
+PAGE=1
+TUNNEL_JSON='{"result":[]}'
+while :; do
+  page_json=$(curl -sS -m 15 -H "Authorization: Bearer $CF_API_TOKEN" \
+    "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel?per_page=50&page=$PAGE&is_deleted=false")
+  page_count=$(echo "$page_json" | python3 -c "import json,sys; print(len(json.load(sys.stdin).get('result') or []))")
+  if [ "$page_count" = "0" ]; then break; fi
+  # Merge pages
+  TUNNEL_JSON=$(python3 -c "
+import json, sys
+acc = json.loads(sys.argv[1])
+new = json.loads(sys.argv[2])
+acc['result'].extend(new.get('result') or [])
+print(json.dumps(acc))
+" "$TUNNEL_JSON" "$page_json")
+  PAGE=$((PAGE + 1))
+  if [ "$PAGE" -gt 20 ]; then
+    log "::warning::stopping pagination at page 20 (1000 tunnels) — re-run if more"
+    break
+  fi
+done
+TOTAL_TUNNELS=$(echo "$TUNNEL_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['result']))")
+log "  total tunnels: $TOTAL_TUNNELS"
+
+# --- Compute orphans -------------------------------------------------------
+#
+# Rules (in order):
+# 1. Name doesn't match `tenant-<slug>` → keep (unknown — never sweep
+#    arbitrary tunnels that might belong to platform infra).
+# 2. Tunnel has active connections (status=healthy or non-empty
+#    connections array) → keep (defense-in-depth: don't kill a live
+#    tunnel even if CP forgot the org).
+# 3. Slug ∈ {prod_slugs ∪ staging_slugs} → keep (live tenant).
+# 4. Otherwise → delete (orphan).
+
+export PROD_SLUGS STAGING_SLUGS
+DECISIONS=$(echo "$TUNNEL_JSON" | python3 -c '
+import json, os, re, sys
+
+prod_slugs = set(os.environ["PROD_SLUGS"].split())
+staging_slugs = set(os.environ["STAGING_SLUGS"].split())
+all_slugs = prod_slugs | staging_slugs
+
+_TENANT_RE = re.compile(r"^tenant-(.+)$")
+
+def decide(t, all_slugs):
+    name = t.get("name", "")
+    tid = t.get("id", "")
+    status = t.get("status", "")
+    conns = t.get("connections") or []
+
+    m = _TENANT_RE.match(name)
+    if not m:
+        return ("keep", "not-a-tenant-tunnel", tid, name, status)
+
+    slug = m.group(1)
+
+    # Defense-in-depth: never delete a tunnel with live connectors.
+    # The CF tunnel "status" field is one of inactive/degraded/healthy/down.
+    # "down" with empty connections is the orphan state we sweep.
+    if status == "healthy" or len(conns) > 0:
+        return ("keep", "active-connections", tid, name, status)
+
+    if slug in all_slugs:
+        return ("keep", "live-tenant", tid, name, status)
+
+    return ("delete", "orphan-tenant", tid, name, status)
+
+d = json.loads(sys.stdin.read())
+for t in d.get("result", []):
+    action, reason, tid, name, status = decide(t, all_slugs)
+    print(json.dumps({"action": action, "reason": reason, "id": tid, "name": name, "status": status}))
+')
+
+# --- Summarize + safety gate ----------------------------------------------
+
+# NOTE: when the account has zero tunnels, DECISIONS is the empty string
+# and `echo` still emits one blank line — each reader below must skip
+# blank lines or json.loads("") aborts the whole script under set -e.
+DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if l.strip() and json.loads(l)['action']=='delete'))")
+KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))
+TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c "
+import json, sys
+n = sum(1 for l in sys.stdin if l.strip() and json.loads(l)['reason'] != 'not-a-tenant-tunnel')
+print(n)
+")
+
+log ""
+log "== Sweep plan =="
+log "  total tunnels:         $TOTAL_TUNNELS"
+log "  tenant-shaped tunnels: $TENANT_TUNNELS"
+log "  would delete:          $DELETE_COUNT"
+log "  would keep:            $KEEP_COUNT"
+log ""
+
+# Per-reason breakdown of deletes
+echo "$DECISIONS" | python3 -c "
+import json,sys,collections
+c = collections.Counter()
+for l in sys.stdin:
+    if not l.strip():
+        continue
+    d = json.loads(l)
+    if d['action'] == 'delete':
+        c[d['reason']] += 1
+for reason, n in c.most_common():
+    print(f'  delete/{reason}: {n}')
+"
+
+# Safety gate operates against the tenant-shaped subset (the reasonable
+# "all of these could conceivably be ours" denominator), not the total.
+# A miscount of platform-infra tunnels shouldn't relax the gate.
+if [ "$TENANT_TUNNELS" -gt 0 ]; then
+  PCT=$(( DELETE_COUNT * 100 / TENANT_TUNNELS ))
+  if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
+    log ""
+    log "SAFETY: would delete $PCT% of tenant-shaped tunnels (threshold $MAX_DELETE_PCT%) — refusing."
+    log "  If this is expected (e.g. major cleanup after incident), rerun with"
+    log "  MAX_DELETE_PCT=$((PCT+5)) $0 $*"
+    exit 2
+  fi
+fi
+
+if [ "$DRY_RUN" = "1" ]; then
+  log ""
+  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
+  log ""
+  log "First 20 tunnels that would be deleted:"
+  echo "$DECISIONS" | python3 -c "
+import json, sys
+shown = 0
+for l in sys.stdin:
+    if not l.strip():
+        continue
+    d = json.loads(l)
+    if d['action'] == 'delete':
+        print(f\"  {d['reason']:25s} {d['name']:40s} status={d['status']}\")
+        shown += 1
+        if shown >= 20: break
+"
+  exit 0
+fi
+
+# --- Execute deletes -------------------------------------------------------
+
+log ""
+log "Executing $DELETE_COUNT deletions..."
+DELETED=0
+FAILED=0
+while IFS= read -r line; do
+  # Guard: with zero tunnels (or zero deletes on an empty DECISIONS) the
+  # here-string still yields one blank line; json.loads("") would abort
+  # the whole script under set -euo pipefail.
+  [ -n "$line" ] || continue
+  action=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['action'])")
+  [ "$action" = "delete" ] || continue
+  tid=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['id'])")
+  name=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['name'])")
+  if curl -sS -m 10 -X DELETE \
+    -H "Authorization: Bearer $CF_API_TOKEN" \
+    "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid" \
+    | grep -q '"success":true'; then
+    DELETED=$((DELETED+1))
+  else
+    FAILED=$((FAILED+1))
+    log "  FAILED: $name ($tid)"
+  fi
+done <<< "$DECISIONS"
+
+log ""
+log "Done. deleted=$DELETED failed=$FAILED"
+# Propagate partial failure as nonzero exit so CI surfaces it.
+[ "$FAILED" -eq 0 ]