From 0576e341b9c99c02bbcc218e373d273d60663ffa Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 23 Apr 2026 21:19:49 -0700 Subject: [PATCH] ops(#1976): add smart-sweep script for orphan Cloudflare DNS records (#1978) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the "panic-button at >65 records" manual sweep that nukes every pattern-match unconditionally (would delete live workspaces along with orphans). This version: - Queries CP prod + staging /admin/orgs for live tenant slugs - Queries AWS EC2 describe-instances for live workspace Name tags - Only deletes CF records whose slug/ws-id has no live counterpart - Dry-run by default (--execute to actually delete) - Safety gate refuses to delete >50% of records (configurable via MAX_DELETE_PCT env var) — catches the "API returned zero orgs, every tenant looks orphan" failure mode before it nukes production - Per-category accounting: orphan-ws / orphan-e2e-tenant / etc. Usage: CF_API_TOKEN=... CF_ZONE_ID=... \ CP_PROD_ADMIN_TOKEN=... CP_STAGING_ADMIN_TOKEN=... \ bash scripts/ops/sweep-cf-orphans.sh # dry-run bash scripts/ops/sweep-cf-orphans.sh --execute # actually delete Ref: #1976 (root-cause: tenant.Delete + workspace.Delete don't clean their CF records — until that's fixed, this script is the maintenance path) Co-authored-by: Hongming Wang Co-authored-by: Claude Opus 4.7 (1M context) Co-authored-by: molecule-ai[bot] <276602405+molecule-ai[bot]@users.noreply.github.com> --- scripts/ops/sweep-cf-orphans.sh | 257 ++++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100755 scripts/ops/sweep-cf-orphans.sh diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh new file mode 100755 index 00000000..2a734ad1 --- /dev/null +++ b/scripts/ops/sweep-cf-orphans.sh @@ -0,0 +1,257 @@ +#!/usr/bin/env bash +# sweep-cf-orphans.sh — safe, targeted sweep of Cloudflare DNS records whose +# corresponding workspace/tenant no longer exists. +# +# Why this exists: tenant.Delete + workspace.Delete don't currently clean +# their CF records — see #1976. Until that lands, records accumulate at +# ~10/hour under normal E2E cadence. The old "sweep when >65" approach +# (deletes every record matching a pattern, regardless of liveness) was a +# panic button that would nuke live workspaces too. +# +# This script is the do-it-right version: +# 1. Query CP admin API to enumerate live org slugs +# 2. Query AWS EC2 to enumerate live workspace Name tags +# 3. For each CF record matching the sweep patterns, check if the +# corresponding slug / ws-id appears in the live sets +# 4. Only delete records with NO live counterpart +# +# Dry-run by default; must pass --execute to actually delete. +# +# Env vars required: +# CF_API_TOKEN — Cloudflare token with zone:dns:edit +# CF_ZONE_ID — the zone (moleculesai.app) +# CP_PROD_ADMIN_TOKEN — CP admin bearer for api.moleculesai.app +# CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app +# AWS_* — standard AWS creds (default region us-east-2) +# +# Exit codes: +# 0 — dry-run completed or sweep executed successfully +# 1 — missing required env, API failure, or unexpected state +# 2 — safety check failed (would delete >50% of records; refusing) + +set -euo pipefail + +DRY_RUN=1 +MAX_DELETE_PCT=50 # refuse to delete more than half the records in one run +REGION="${AWS_DEFAULT_REGION:-us-east-2}" + +for arg in "$@"; do + case "$arg" in + --execute|--no-dry-run) DRY_RUN=0 ;; + --help|-h) + grep '^#' "$0" | head -35 | sed 's/^# \{0,1\}//' + exit 0 + ;; + *) + echo "unknown arg: $arg (use --help)" >&2 + exit 1 + ;; + esac +done + +need() { + local var="$1" + if [ -z "${!var:-}" ]; then + echo "ERROR: $var is required" >&2 + exit 1 + fi +} +need CF_API_TOKEN +need CF_ZONE_ID +need CP_PROD_ADMIN_TOKEN +need CP_STAGING_ADMIN_TOKEN + +log() { echo "[$(date -u +%H:%M:%S)] $*"; } + +# --- Gather live sets ------------------------------------------------------ + +log "Fetching CP prod org slugs..." +PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \ + "https://api.moleculesai.app/cp/admin/orgs?limit=500" \ + | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))") +log " prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')" + +log "Fetching CP staging org slugs..." +STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \ + "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \ + | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))") +log " staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')" + +log "Fetching live EC2 Name tags (region=$REGION)..." +# Use JSON output + python — AWS CLI's --query with nested filters has +# surprising flattening behavior that dropped tags silently on first attempt. +EC2_NAMES=$(aws ec2 describe-instances --region "$REGION" \ + --filters "Name=instance-state-name,Values=running,pending" \ + --output json 2>/dev/null | python3 -c ' +import json, sys +out = [] +for r in json.load(sys.stdin).get("Reservations", []): + for inst in r.get("Instances", []): + for t in inst.get("Tags", []): + if t.get("Key") == "Name" and t.get("Value"): + out.append(t["Value"]) +print(" ".join(out)) +') +log " live EC2s: $(echo "$EC2_NAMES" | wc -w | tr -d ' ')" + +log "Fetching Cloudflare DNS records..." +CF_JSON=$(curl -sS -m 15 -H "Authorization: Bearer $CF_API_TOKEN" \ + "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records?per_page=500") +TOTAL_CF=$(echo "$CF_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['result']))") +log " CF records: $TOTAL_CF" + +# --- Compute orphans ------------------------------------------------------- + +# We emit NDJSON so downstream can pipe into jq etc. Each line is one decision. +# Fields: action=keep|delete, reason, id, name, type. +# +# Rules (in order of priority — first match wins): +# 1. Platform-core (api, app, doc, apex, www, _vercel, _domainkey, _railway-verify, +# send, status, MX root) → always keep. +# 2. Tenant subdomain `.moleculesai.app` or `.staging.moleculesai.app` +# → keep if ∈ {prod_slugs ∪ staging_slugs}, else delete. +# 3. ws-.moleculesai.app / ws-.staging.moleculesai.app +# → keep if ws-* matches any live EC2 Name (prefix match), else delete. +# 4. e2e-.staging.moleculesai.app (or canary/canvas variants) +# → keep if ∈ {prod_slugs ∪ staging_slugs}, else delete. +# 5. Anything else → keep (we only sweep patterns we understand). + +export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF +DECISIONS=$(echo "$CF_JSON" | python3 -c ' +import json, os, re, sys +d = json.load(sys.stdin) +prod = set(os.environ["PROD_SLUGS"].split()) +staging = set(os.environ["STAGING_SLUGS"].split()) +all_slugs = prod | staging +ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n) + +def decide(r): + n = r["name"] + rid = r["id"] + typ = r["type"] + + # Rule 1: platform core — leave alone + if n == "moleculesai.app": + return ("keep", "apex", rid, n, typ) + if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): + return ("keep", "verification/key", rid, n, typ) + if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app", + "send.moleculesai.app","status.moleculesai.app","www.moleculesai.app", + "staging-api.moleculesai.app"}: + return ("keep", "platform-core", rid, n, typ) + + # Rule 3: ws--.(staging.)moleculesai.app + m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n) + if m: + prefix = m.group(1) + # Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain. + for ename in ec2_names: + if ename.startswith(prefix): + return ("keep", "live-ec2", rid, n, typ) + return ("delete", "orphan-ws", rid, n, typ) + + # Rule 4: e2e-* tenants (includes canary, canvas variants) + m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n) + if m: + slug = m.group(1) + if slug in all_slugs: + return ("keep", "live-e2e-tenant", rid, n, typ) + return ("delete", "orphan-e2e-tenant", rid, n, typ) + + # Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app) + m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n) + if m: + slug = m.group(1) + if slug in all_slugs: + return ("keep", "live-tenant", rid, n, typ) + # Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*") + # To avoid false-positive nukes on ad-hoc records, we KEEP anything that + # does not match a known pattern. Orphan only for explicit tenant-shaped names. + return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) + + return ("keep", "not-a-pattern-we-sweep", rid, n, typ) + +for r in d["result"]: + action, reason, rid, name, typ = decide(r) + print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ})) +') + +# --- Summarize + safety gate ---------------------------------------------- + +DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") +KEEP_COUNT=$((TOTAL_CF - DELETE_COUNT)) + +log "" +log "== Sweep plan ==" +log " total CF records: $TOTAL_CF" +log " would delete: $DELETE_COUNT" +log " would keep: $KEEP_COUNT" +log "" + +# Per-reason breakdown of deletes +echo "$DECISIONS" | python3 -c " +import json,sys,collections +c = collections.Counter() +for l in sys.stdin: + d = json.loads(l) + if d['action'] == 'delete': + c[d['reason']] += 1 +for reason, n in c.most_common(): + print(f' delete/{reason}: {n}') +" + +# Safety gate: refuse to delete more than MAX_DELETE_PCT of records. If we +# hit this, something is wrong — probably CP admin API returned no orgs, +# making every tenant look orphan. Bail before nuking production. +if [ "$TOTAL_CF" -gt 0 ]; then + PCT=$(( DELETE_COUNT * 100 / TOTAL_CF )) + if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then + log "" + log "SAFETY: would delete $PCT% of records (threshold $MAX_DELETE_PCT%) — refusing." + log " If this is expected (e.g. major cleanup after incident), rerun with" + log " MAX_DELETE_PCT=$((PCT+5)) $0 $*" + exit 2 + fi +fi + +if [ "$DRY_RUN" = "1" ]; then + log "" + log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT records." + log "" + log "First 20 records that would be deleted:" + echo "$DECISIONS" | python3 -c " +import json, sys +for i, l in enumerate(sys.stdin): + d = json.loads(l) + if d['action'] == 'delete': + print(f\" {d['reason']:25s} {d['name']}\") + if i > 50: break +" | head -20 + exit 0 +fi + +# --- Execute deletes ------------------------------------------------------- + +log "" +log "Executing $DELETE_COUNT deletions..." +DELETED=0 +FAILED=0 +while IFS= read -r line; do + action=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['action'])") + [ "$action" = "delete" ] || continue + rid=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['id'])") + name=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['name'])") + if curl -sS -m 10 -X DELETE \ + -H "Authorization: Bearer $CF_API_TOKEN" \ + "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records/$rid" \ + | grep -q '"success":true'; then + DELETED=$((DELETED+1)) + else + FAILED=$((FAILED+1)) + log " FAILED: $name ($rid)" + fi +done <<< "$DECISIONS" + +log "" +log "Done. deleted=$DELETED failed=$FAILED" +[ "$FAILED" -eq 0 ]