#!/usr/bin/env bash
# sweep-aws-secrets.sh — safe, targeted sweep of AWS Secrets Manager
# secrets whose corresponding tenant no longer exists.
#
# Why this exists: CP's tenant-delete cascade calls
# Secrets.DeleteSecret() at deprovision time, but only when the
# deprovision flow runs to completion (provisioner/ec2.go:806). Crashed
# provisions, hard-failed E2E runs, and any tenant created without a
# matching deprovision (early-bail in provisioner, manual orchestration
# bugs) leak the per-tenant bootstrap secret. At ~$0.40/secret/month,
# 50 leaked secrets = $20/month — enough to show up on the cost
# dashboard.
#
# Observed 2026-05-03: AWS Secrets Manager line item ~$19/month with
# only one tenant currently provisioned, indicating ~45+ orphan
# secrets. The tenant_resources audit table (mig 024) tracks four
# resource kinds (Cloudflare Tunnel, Cloudflare DNS, EC2 Instance,
# Security Group) but NOT Secrets Manager — the long-term fix is to
# add KindSecretsManagerSecret + recorder hook + reconciler enumerator.
# Tracked separately as a controlplane issue.
#
# This is a parallel-shape janitor to sweep-cf-tunnels.sh:
#   1. Query CP admin API to enumerate live org IDs (prod + staging)
#   2. Enumerate AWS Secrets Manager secrets matching the tenant prefix
#   3. For each secret matching `molecule/tenant/<org_id>/bootstrap`,
#      check if the org_id appears in the live set
#   4. Defense-in-depth: skip secrets created in the last 24h
#      (window for a provision-in-progress that hasn't yet finished
#      its first heartbeat to CP)
#   5. Only delete secrets with NO live org counterpart AND outside
#      the 24h grace window
#
# Dry-run by default; must pass --execute to actually delete.
#
# Note on deletion semantics: --force-delete-without-recovery skips
# the 7-30 day recovery window. We accept this because (a) the grace
# window above already filters in-flight provisions, and (b) the
# bootstrap secret is regenerated on every reprovision — losing one
# is recoverable by re-running the provision flow.
#
# Env vars required:
#   AWS_REGION             — region the secrets live in (default: us-east-1)
#   CP_PROD_ADMIN_TOKEN    — CP admin bearer for api.moleculesai.app
#   CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app
#   AWS_ACCESS_KEY_ID,     — IAM principal with secretsmanager:ListSecrets
#   AWS_SECRET_ACCESS_KEY    and secretsmanager:DeleteSecret. Note: the
#                            prod molecule-cp principal does NOT have
#                            these permissions; the workflow uses a
#                            dedicated janitor principal.
#
# Exit codes:
#   0 — dry-run completed or sweep executed successfully
#   1 — missing required env, API failure, or unexpected state
#   2 — safety check failed (would delete >MAX_DELETE_PCT% of
#       tenant-shaped secrets; refusing)

set -euo pipefail

DRY_RUN=1

# Tenant secrets are durable by design — they should track 1:1 with
# live tenants. The 50% default mirrors sweep-cf-orphans.sh (DNS
# records, also durable) rather than sweep-cf-tunnels.sh (90%, mostly
# orphan by design). If the live tenant count drops by more than half
# in one sweep window, that's an incident worth investigating before
# we erase the audit trail.
MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}"
GRACE_HOURS="${GRACE_HOURS:-24}"
AWS_REGION="${AWS_REGION:-us-east-1}"

for arg in "$@"; do
  case "$arg" in
    --execute|--no-dry-run)
      DRY_RUN=0
      ;;
    --help|-h)
      # NB: do NOT use `head -55` here — with `set -o pipefail`, head
      # exiting early SIGPIPEs grep (exit 141) and set -e kills the
      # script before the `exit 0` below. sed reads its whole input,
      # so there is no early pipe close; '56,$d' keeps lines 1-55.
      grep '^#' "$0" | sed -e '56,$d' -e 's/^# \{0,1\}//'
      exit 0
      ;;
    *)
      echo "unknown arg: $arg (use --help)" >&2
      exit 1
      ;;
  esac
done

# need VAR — exit 1 unless the env var named VAR is set and non-empty.
need() {
  local var="$1"
  if [ -z "${!var:-}" ]; then
    echo "ERROR: $var is required" >&2
    exit 1
  fi
}

need CP_PROD_ADMIN_TOKEN
need CP_STAGING_ADMIN_TOKEN
need AWS_ACCESS_KEY_ID
need AWS_SECRET_ACCESS_KEY

if ! command -v aws >/dev/null 2>&1; then
  echo "ERROR: aws cli is required" >&2
  exit 1
fi

# log MSG... — timestamped (UTC, HH:MM:SS) line to stdout.
log() { echo "[$(date -u +%H:%M:%S)] $*"; }

# --- Gather live sets ------------------------------------------------------
#
# Secret naming uses the tenant's UUID (org_id), not the slug — see
# awsapi.TenantSecretName in molecule-controlplane. The /cp/admin/orgs
# response includes both `id` and `slug`; we extract `id` here.

# fetch_org_ids BASE_URL TOKEN — print space-separated live org ids.
#
# curl -f is load-bearing: without it, an expired token or a 5xx
# returns an error body that silently yields an EMPTY live set, which
# would mark every tenant secret an orphan (only the MAX_DELETE_PCT
# gate would stand between that and mass deletion). Likewise the
# strict d["orgs"] access: an unexpected response schema raises
# KeyError, and set -e/pipefail aborts the sweep instead of guessing.
fetch_org_ids() {
  local base_url="$1" token="$2"
  curl -fsS -m 15 -H "Authorization: Bearer $token" \
    "$base_url/cp/admin/orgs?limit=500" \
    | python3 -c 'import json,sys; d=json.load(sys.stdin); print(" ".join(o["id"] for o in d["orgs"]))'
}

log "Fetching CP prod org ids..."
PROD_IDS=$(fetch_org_ids "https://api.moleculesai.app" "$CP_PROD_ADMIN_TOKEN")
log "  prod orgs: $(echo "$PROD_IDS" | wc -w | tr -d ' ')"

log "Fetching CP staging org ids..."
STAGING_IDS=$(fetch_org_ids "https://staging-api.moleculesai.app" "$CP_STAGING_ADMIN_TOKEN")
log "  staging orgs: $(echo "$STAGING_IDS" | wc -w | tr -d ' ')"

log "Fetching AWS Secrets Manager secrets (region=$AWS_REGION)..."
# list-secrets is paginated via NextToken. The aws cli auto-paginates
# unless --max-items is set, but explicit pagination keeps us safe
# from any sudden default change and lets us cap at a sane upper
# bound. ListSecrets returns up to 100 per page; we cap at 50 pages
# (5000 secrets) which is well past any plausible tenant count.
PAGES_DIR=$(mktemp -d -t aws-secrets-XXXXXX)
DELETE_PLAN=""
NAME_MAP=""
FAIL_LOG=""
RESULT_LOG=""

# Remove every temp artifact on any exit path. The plan/map/log files
# are only created in the execute phase, hence the non-empty guards.
cleanup() {
  rm -rf "$PAGES_DIR"
  [ -n "$DELETE_PLAN" ] && rm -f "$DELETE_PLAN"
  [ -n "$NAME_MAP" ] && rm -f "$NAME_MAP"
  [ -n "$FAIL_LOG" ] && rm -f "$FAIL_LOG"
  [ -n "$RESULT_LOG" ] && rm -f "$RESULT_LOG"
  return 0
}
trap cleanup EXIT

NEXT_TOKEN=""
PAGE=1
while :; do
  page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json"
  # Build the invocation once; only the continuation token differs
  # between the first and subsequent pages (previously two near-
  # identical aws calls that had to be kept in sync by hand).
  list_args=(
    secretsmanager list-secrets
    --region "$AWS_REGION"
    --filters Key=name,Values=molecule/tenant/
    --max-results 100
    --output json
  )
  if [ -n "$NEXT_TOKEN" ]; then
    list_args+=(--next-token "$NEXT_TOKEN")
  fi
  aws "${list_args[@]}" > "$page_file"
  NEXT_TOKEN=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d.get('NextToken') or '')" "$page_file")
  PAGE=$((PAGE + 1))
  if [ -z "$NEXT_TOKEN" ]; then
    break
  fi
  if [ "$PAGE" -gt 50 ]; then
    log "::warning::stopping pagination at page 50 (5000 secrets) — re-run if more"
    break
  fi
done

# Merge all pages into one SecretList document. A corrupt/empty page
# file raises here and (via set -e) aborts — better than silently
# dropping a page of secrets from consideration.
SECRET_JSON=$(python3 -c '
import glob, json, os, sys
acc = {"SecretList": []}
for f in sorted(glob.glob(os.path.join(sys.argv[1], "page-*.json"))):
    with open(f) as fh:
        acc["SecretList"].extend(json.load(fh).get("SecretList") or [])
print(json.dumps(acc))
' "$PAGES_DIR")

TOTAL_SECRETS=$(echo "$SECRET_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['SecretList']))")
log "  total tenant-prefixed secrets: $TOTAL_SECRETS"

# --- Compute orphans -------------------------------------------------------
#
# Rules (in order):
#   1. Name doesn't match `molecule/tenant/<org_id>/bootstrap` → keep
#      (unknown — never sweep arbitrary secrets that might belong to
#      platform infra or other tenants of this AWS account).
#   2. CreatedDate within $GRACE_HOURS → keep (defense-in-depth: don't
#      kill a secret while its provision is still mid-flight).
#   3. org_id in {prod_ids + staging_ids} → keep (live tenant).
#   4. Otherwise → delete (orphan).
export PROD_IDS STAGING_IDS GRACE_HOURS

# decide_secrets — read a merged ListSecrets JSON document on stdin
# and emit one JSON decision object per secret:
#   {"action": keep|delete, "reason": ..., "arn": ..., "name": ...}
# Reads PROD_IDS / STAGING_IDS / GRACE_HOURS from the environment.
decide_secrets() {
  python3 -c '
import json, os, re, sys
from datetime import datetime, timezone, timedelta

prod_ids = set(os.environ["PROD_IDS"].split())
staging_ids = set(os.environ["STAGING_IDS"].split())
all_ids = prod_ids | staging_ids
grace = timedelta(hours=int(os.environ["GRACE_HOURS"]))
now = datetime.now(timezone.utc)

# molecule/tenant/<org_id>/bootstrap — org_id is a UUID.
_TENANT_RE = re.compile(r"^molecule/tenant/([0-9a-fA-F-]{36})/bootstrap$")

def parse_iso(s):
    if not s:
        return None
    # AWS returns ISO8601 with a timezone, usually "+00:00" but
    # sometimes a literal "Z". fromisoformat only accepts "Z" from
    # Python 3.11 onward, so normalize for older interpreters.
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    try:
        dt = datetime.fromisoformat(s)
    except ValueError:
        return None
    # Defensive: a naive timestamp would make the subtraction below
    # raise TypeError; AWS timestamps are UTC, so pin the zone.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt

def decide(s):
    name = s.get("Name", "")
    arn = s.get("ARN", "")
    m = _TENANT_RE.match(name)
    if not m:
        return ("keep", "not-a-tenant-secret", arn, name)
    org_id = m.group(1)
    created = parse_iso(s.get("CreatedDate") or s.get("LastChangedDate"))
    if created is None:
        # No parseable timestamp: we cannot prove the secret is
        # outside the grace window, so err on the side of keeping it.
        # (Previously this fell through to the orphan check, silently
        # voiding the grace defense for date-less records.)
        return ("keep", "unknown-created-date", arn, name)
    if (now - created) < grace:
        return ("keep", "in-grace-window", arn, name)
    if org_id in all_ids:
        return ("keep", "live-tenant", arn, name)
    return ("delete", "orphan-tenant", arn, name)

d = json.loads(sys.stdin.read())
for s in d.get("SecretList", []):
    action, reason, arn, name = decide(s)
    print(json.dumps({"action": action, "reason": reason, "arn": arn, "name": name}))
'
}

DECISIONS=$(echo "$SECRET_JSON" | decide_secrets)

# --- Summarize + safety gate ----------------------------------------------
#
# NB: every stdin-reading counter below skips blank lines — when there
# are zero secrets, DECISIONS is empty and `echo` still emits a single
# newline, which would otherwise crash json.loads (and, via set -e,
# the whole sweep) precisely in the happy "nothing to do" case.

DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if l.strip() and json.loads(l)['action']=='delete'))")
KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c "
import json, sys
n = sum(1 for l in sys.stdin if l.strip() and json.loads(l)['reason'] != 'not-a-tenant-secret')
print(n)
")

log ""
log "== Sweep plan =="
log "  total secrets:         $TOTAL_SECRETS"
log "  tenant-shaped secrets: $TENANT_SECRETS"
log "  would delete:          $DELETE_COUNT"
log "  would keep:            $KEEP_COUNT"
log ""

# Per-reason breakdown of deletes + keep-categories worth seeing
echo "$DECISIONS" | python3 -c "
import json, sys, collections
delete_c = collections.Counter()
keep_c = collections.Counter()
for l in sys.stdin:
    if not l.strip():
        continue
    d = json.loads(l)
    if d['action'] == 'delete':
        delete_c[d['reason']] += 1
    else:
        keep_c[d['reason']] += 1
for reason, n in delete_c.most_common():
    print(f'  delete/{reason}: {n}')
for reason, n in keep_c.most_common():
    print(f'  keep/{reason}: {n}')
"

# Safety gate operates against the tenant-shaped subset — same
# rationale as sweep-cf-tunnels: a miscount of platform-infra
# secrets shouldn't relax the gate.
if [ "$TENANT_SECRETS" -gt 0 ]; then
  PCT=$(( DELETE_COUNT * 100 / TENANT_SECRETS ))
  if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
    log ""
    log "SAFETY: would delete $PCT% of tenant-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing."
    log "  If this is expected (e.g. major cleanup after incident), rerun with"
    log "  MAX_DELETE_PCT=$((PCT+5)) $0 $*"
    exit 2
  fi
fi

if [ "$DRY_RUN" = "1" ]; then
  log ""
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets."
  log ""
  log "First 20 secrets that would be deleted:"
  # NB: the reader must drain stdin. A `break` after 20 lines would
  # close the pipe while echo is still writing a large DECISIONS,
  # and the resulting SIGPIPE (exit 141) would trip pipefail/set -e
  # before the `exit 0` below.
  echo "$DECISIONS" | python3 -c "
import json, sys
shown = 0
for l in sys.stdin:
    if not l.strip():
        continue
    d = json.loads(l)
    if shown < 20 and d['action'] == 'delete':
        print(f\"  {d['reason']:25s} {d['name']}\")
        shown += 1
"
  exit 0
fi

# --- Execute deletes -------------------------------------------------------
#
# Parallel delete loop following sweep-cf-tunnels.sh's pattern.
# AWS Secrets Manager DeleteSecret is fast (~0.3s/call), so even a
# serial loop would handle 100s of secrets within the workflow's
# 30 min cap, but parallel-by-default keeps us symmetric with the
# other sweepers and gives headroom for a one-off backlog.
#
# --force-delete-without-recovery skips the 7-30 day recovery window.
# Acceptable here because (a) the GRACE_HOURS filter prevents touching
# in-flight provisions, and (b) the secret is regenerated on every
# fresh provision — losing one only matters for a tenant we're
# explicitly trying to forget.

CONCURRENCY="${SWEEP_CONCURRENCY:-8}"

DELETE_PLAN=$(mktemp -t aws-secrets-plan-XXXXXX)
NAME_MAP=$(mktemp -t aws-secrets-names-XXXXXX)
FAIL_LOG=$(mktemp -t aws-secrets-fail-XXXXXX)
RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX)

# Build delete plan (one ARN per line) and id→name side-channel for
# failure-log readability. Use ARN rather than Name on the delete
# call because Name is mutable; ARN is the stable identifier.
# Blank lines are skipped: with zero deletions, DECISIONS is empty
# and echo still emits one bare newline that json.loads would reject.
echo "$DECISIONS" | python3 -c '
import json, sys

plan_path = sys.argv[1]
map_path = sys.argv[2]
with open(plan_path, "w") as plan, open(map_path, "w") as nmap:
    for line in sys.stdin:
        if not line.strip():
            continue
        d = json.loads(line)
        if d.get("action") != "delete":
            continue
        arn = d["arn"]
        name = d.get("name", "")
        plan.write(arn + "\n")
        nmap.write(arn + "\t" + name + "\n")
' "$DELETE_PLAN" "$NAME_MAP"

log ""
log "Executing $DELETE_COUNT deletions ($CONCURRENCY-way parallel)..."

export AWS_REGION NAME_MAP FAIL_LOG
# -I already implies one input line per invocation, so the `-L 1` the
# other sweepers carry is redundant here (GNU xargs warns about the
# combination). Per-delete failures append to FAIL_LOG; the short
# O_APPEND writes are atomic enough for the 8-way fan-out.
# shellcheck disable=SC2016
xargs -P "$CONCURRENCY" -I {} bash -c '
  arn="$1"
  if aws secretsmanager delete-secret \
      --region "$AWS_REGION" \
      --secret-id "$arn" \
      --force-delete-without-recovery \
      --output json >/dev/null 2>&1; then
    echo OK
  else
    name=$(awk -F"\t" -v a="$arn" "\$1==a {print \$2; exit}" "$NAME_MAP")
    echo FAIL
    echo "FAIL $name $arn" >> "$FAIL_LOG"
  fi
' _ {} < "$DELETE_PLAN" > "$RESULT_LOG"

# grep -c prints 0 on no match but exits 1; `|| true` keeps set -e
# from treating the empty-result case as an error.
DELETED=$(grep -c '^OK$' "$RESULT_LOG" || true)
FAILED=$(grep -c '^FAIL$' "$RESULT_LOG" || true)

log ""
log "Done. deleted=$DELETED failed=$FAILED"
if [ "$FAILED" -ne 0 ]; then
  log "Failure detail (first 20):"
  head -20 "$FAIL_LOG" | while IFS= read -r fl; do log "  $fl"; done
fi

# Final exit status: succeed only if every planned delete succeeded.
[ "$FAILED" -eq 0 ]