molecule-core/scripts/ops/sweep-cf-orphans.sh
Hongming Wang 0576e341b9
ops(#1976): add smart-sweep script for orphan Cloudflare DNS records (#1978)
Replaces the "panic-button at >65 records" manual sweep that nukes
every pattern-match unconditionally (would delete live workspaces
along with orphans).

This version:
- Queries CP prod + staging /admin/orgs for live tenant slugs
- Queries AWS EC2 describe-instances for live workspace Name tags
- Only deletes CF records whose slug/ws-id has no live counterpart
- Dry-run by default (--execute to actually delete)
- Safety gate refuses to delete >50% of records (configurable via
  MAX_DELETE_PCT env var) — catches the "API returned zero orgs, every
  tenant looks orphan" failure mode before it nukes production
- Per-category accounting: orphan-ws / orphan-e2e-tenant / etc.

Usage:
  CF_API_TOKEN=... CF_ZONE_ID=... \
    CP_PROD_ADMIN_TOKEN=... CP_STAGING_ADMIN_TOKEN=... \
    bash scripts/ops/sweep-cf-orphans.sh           # dry-run
  bash scripts/ops/sweep-cf-orphans.sh --execute   # actually delete

Ref: #1976 (root-cause: tenant.Delete + workspace.Delete don't clean
their CF records — until that's fixed, this script is the maintenance
path)

Co-authored-by: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: molecule-ai[bot] <276602405+molecule-ai[bot]@users.noreply.github.com>
2026-04-24 04:19:49 +00:00

258 lines
9.8 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# sweep-cf-orphans.sh — safe, targeted sweep of Cloudflare DNS records whose
# corresponding workspace/tenant no longer exists.
#
# Why this exists: tenant.Delete + workspace.Delete don't currently clean
# their CF records — see #1976. Until that lands, records accumulate at
# ~10/hour under normal E2E cadence. The old "sweep when >65" approach
# (deletes every record matching a pattern, regardless of liveness) was a
# panic button that would nuke live workspaces too.
#
# This script is the do-it-right version:
# 1. Query CP admin API to enumerate live org slugs
# 2. Query AWS EC2 to enumerate live workspace Name tags
# 3. For each CF record matching the sweep patterns, check if the
# corresponding slug / ws-id appears in the live sets
# 4. Only delete records with NO live counterpart
#
# Dry-run by default; must pass --execute to actually delete.
#
# Env vars required:
# CF_API_TOKEN — Cloudflare token with zone:dns:edit
# CF_ZONE_ID — the zone (moleculesai.app)
# CP_PROD_ADMIN_TOKEN — CP admin bearer for api.moleculesai.app
# CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app
# AWS_* — standard AWS creds (default region us-east-2)
#
# Exit codes:
# 0 — dry-run completed or sweep executed successfully
# 1 — missing required env, API failure, or unexpected state
# 2 — safety check failed (would delete >50% of records; refusing)
set -euo pipefail
DRY_RUN=1
MAX_DELETE_PCT=50 # refuse to delete more than half the records in one run
REGION="${AWS_DEFAULT_REGION:-us-east-2}"
for arg in "$@"; do
case "$arg" in
--execute|--no-dry-run) DRY_RUN=0 ;;
--help|-h)
grep '^#' "$0" | head -35 | sed 's/^# \{0,1\}//'
exit 0
;;
*)
echo "unknown arg: $arg (use --help)" >&2
exit 1
;;
esac
done
need() {
local var="$1"
if [ -z "${!var:-}" ]; then
echo "ERROR: $var is required" >&2
exit 1
fi
}
need CF_API_TOKEN
need CF_ZONE_ID
need CP_PROD_ADMIN_TOKEN
need CP_STAGING_ADMIN_TOKEN
log() { echo "[$(date -u +%H:%M:%S)] $*"; }
# --- Gather live sets ------------------------------------------------------
log "Fetching CP prod org slugs..."
PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \
"https://api.moleculesai.app/cp/admin/orgs?limit=500" \
| python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
log " prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')"
log "Fetching CP staging org slugs..."
STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \
"https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
| python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
log " staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')"
log "Fetching live EC2 Name tags (region=$REGION)..."
# Use JSON output + python — AWS CLI's --query with nested filters has
# surprising flattening behavior that dropped tags silently on first attempt.
EC2_NAMES=$(aws ec2 describe-instances --region "$REGION" \
--filters "Name=instance-state-name,Values=running,pending" \
--output json 2>/dev/null | python3 -c '
import json, sys
out = []
for r in json.load(sys.stdin).get("Reservations", []):
for inst in r.get("Instances", []):
for t in inst.get("Tags", []):
if t.get("Key") == "Name" and t.get("Value"):
out.append(t["Value"])
print(" ".join(out))
')
log " live EC2s: $(echo "$EC2_NAMES" | wc -w | tr -d ' ')"
log "Fetching Cloudflare DNS records..."
CF_JSON=$(curl -sS -m 15 -H "Authorization: Bearer $CF_API_TOKEN" \
"https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records?per_page=500")
TOTAL_CF=$(echo "$CF_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['result']))")
log " CF records: $TOTAL_CF"
# --- Compute orphans -------------------------------------------------------
# We emit NDJSON so downstream can pipe into jq etc. Each line is one decision.
# Fields: action=keep|delete, reason, id, name, type.
#
# Rules (in order of priority — first match wins):
# 1. Platform-core (api, app, doc, apex, www, _vercel, _domainkey, _railway-verify,
# send, status, MX root) → always keep.
# 2. Tenant subdomain `<slug>.moleculesai.app` or `<slug>.staging.moleculesai.app`
# → keep if <slug> ∈ {prod_slugs staging_slugs}, else delete.
# 3. ws-<id8>.moleculesai.app / ws-<id8>.staging.moleculesai.app
# → keep if ws-<id8>* matches any live EC2 Name (prefix match), else delete.
# 4. e2e-<slug>.staging.moleculesai.app (or canary/canvas variants)
# → keep if <slug> ∈ {prod_slugs staging_slugs}, else delete.
# 5. Anything else → keep (we only sweep patterns we understand).
export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF
DECISIONS=$(echo "$CF_JSON" | python3 -c '
import json, os, re, sys
d = json.load(sys.stdin)
prod = set(os.environ["PROD_SLUGS"].split())
staging = set(os.environ["STAGING_SLUGS"].split())
all_slugs = prod | staging
ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n)
def decide(r):
n = r["name"]
rid = r["id"]
typ = r["type"]
# Rule 1: platform core — leave alone
if n == "moleculesai.app":
return ("keep", "apex", rid, n, typ)
if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"):
return ("keep", "verification/key", rid, n, typ)
if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app",
"send.moleculesai.app","status.moleculesai.app","www.moleculesai.app",
"staging-api.moleculesai.app"}:
return ("keep", "platform-core", rid, n, typ)
# Rule 3: ws-<hex8>-<rest>.(staging.)moleculesai.app
m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n)
if m:
prefix = m.group(1)
# Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain.
for ename in ec2_names:
if ename.startswith(prefix):
return ("keep", "live-ec2", rid, n, typ)
return ("delete", "orphan-ws", rid, n, typ)
# Rule 4: e2e-* tenants (includes canary, canvas variants)
m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-e2e-tenant", rid, n, typ)
return ("delete", "orphan-e2e-tenant", rid, n, typ)
# Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app)
m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-tenant", rid, n, typ)
# Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*")
# To avoid false-positive nukes on ad-hoc records, we KEEP anything that
# does not match a known pattern. Orphan only for explicit tenant-shaped names.
return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ)
return ("keep", "not-a-pattern-we-sweep", rid, n, typ)
for r in d["result"]:
action, reason, rid, name, typ = decide(r)
print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ}))
')
# --- Summarize + safety gate ----------------------------------------------
DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
KEEP_COUNT=$((TOTAL_CF - DELETE_COUNT))
log ""
log "== Sweep plan =="
log " total CF records: $TOTAL_CF"
log " would delete: $DELETE_COUNT"
log " would keep: $KEEP_COUNT"
log ""
# Per-reason breakdown of deletes
echo "$DECISIONS" | python3 -c "
import json,sys,collections
c = collections.Counter()
for l in sys.stdin:
d = json.loads(l)
if d['action'] == 'delete':
c[d['reason']] += 1
for reason, n in c.most_common():
print(f' delete/{reason}: {n}')
"
# Safety gate: refuse to delete more than MAX_DELETE_PCT of records. If we
# hit this, something is wrong — probably CP admin API returned no orgs,
# making every tenant look orphan. Bail before nuking production.
if [ "$TOTAL_CF" -gt 0 ]; then
PCT=$(( DELETE_COUNT * 100 / TOTAL_CF ))
if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
log ""
log "SAFETY: would delete $PCT% of records (threshold $MAX_DELETE_PCT%) — refusing."
log " If this is expected (e.g. major cleanup after incident), rerun with"
log " MAX_DELETE_PCT=$((PCT+5)) $0 $*"
exit 2
fi
fi
if [ "$DRY_RUN" = "1" ]; then
log ""
log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT records."
log ""
log "First 20 records that would be deleted:"
echo "$DECISIONS" | python3 -c "
import json, sys
for i, l in enumerate(sys.stdin):
d = json.loads(l)
if d['action'] == 'delete':
print(f\" {d['reason']:25s} {d['name']}\")
if i > 50: break
" | head -20
exit 0
fi
# --- Execute deletes -------------------------------------------------------
log ""
log "Executing $DELETE_COUNT deletions..."
DELETED=0
FAILED=0
while IFS= read -r line; do
action=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['action'])")
[ "$action" = "delete" ] || continue
rid=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['id'])")
name=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['name'])")
if curl -sS -m 10 -X DELETE \
-H "Authorization: Bearer $CF_API_TOKEN" \
"https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records/$rid" \
| grep -q '"success":true'; then
DELETED=$((DELETED+1))
else
FAILED=$((FAILED+1))
log " FAILED: $name ($rid)"
fi
done <<< "$DECISIONS"
log ""
log "Done. deleted=$DELETED failed=$FAILED"
[ "$FAILED" -eq 0 ]