Merge pull request #2557 from Molecule-AI/feat/sweep-aws-secrets-orphans

feat(ops): sweep orphan AWS Secrets Manager secrets
2026-05-03 09:48:59 +00:00 · 2026-05-03 09:48:59 +00:00 · e014d22ee9
commit e014d22ee9
parent 18c2bdbe68 6f8f7932d2
2 changed files with 504 additions and 0 deletions
--- a/.github/workflows/sweep-aws-secrets.yml
+++ b/.github/workflows/sweep-aws-secrets.yml
@ -0,0 +1,129 @@
+name: Sweep stale AWS Secrets Manager secrets
+
+# Janitor for per-tenant AWS Secrets Manager secrets
+# (`molecule/tenant/<org_id>/bootstrap`) whose backing tenant no
+# longer exists. Parallel-shape to sweep-cf-tunnels.yml and
+# sweep-cf-orphans.yml — different cloud, same justification.
+#
+# Why this exists separately from a long-term reconciler integration:
+#   - molecule-controlplane's tenant_resources audit table (mig 024)
+#     currently tracks four resource kinds: CloudflareTunnel,
+#     CloudflareDNS, EC2Instance, SecurityGroup. SecretsManager is
+#     not in the list, so the existing reconciler doesn't catch
+#     orphan secrets.
+#   - At ~$0.40/secret/month the cost grew to ~$19/month before this
+#     sweeper was written, indicating ~45+ orphan secrets from
+#     crashed provisions and incomplete deprovision flows.
+#   - The proper fix (KindSecretsManagerSecret + recorder hook +
+#     reconciler enumerator) is filed as a separate controlplane
+#     issue. This sweeper is the immediate cost-relief stopgap.
+#
+# IAM principal: AWS_JANITOR_ACCESS_KEY_ID / AWS_JANITOR_SECRET_ACCESS_KEY.
+# This is a DEDICATED principal — the production `molecule-cp` IAM
+# user lacks `secretsmanager:ListSecrets` (it only has
+# Get/Create/Update/Delete on specific resources, scoped to its
+# operational needs). The janitor needs ListSecrets across the
+# `molecule/tenant/*` prefix, which warrants a separate principal so
+# we don't broaden the prod-CP policy.
+#
+# Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
+# sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
+# the mostly-orphan tunnels) refuses to nuke past the threshold.
+
+on:
+  schedule:
+    # Fires at minute 30 of every hour. sweep-cf-orphans runs at :15
+    # and sweep-cf-tunnels at :45; staggering the three janitors keeps
+    # them from hitting the CP admin endpoints in the same minute.
+    - cron: '30 * * * *'
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        type: boolean
+        required: false
+        default: true
+        description: 'Dry run only — list what would be deleted, no deletion'
+      max_delete_pct:
+        required: false
+        default: '50'
+        description: 'Override safety gate (default 50, set higher only for major cleanup)'
+      grace_hours:
+        required: false
+        default: '24'
+        description: 'Skip secrets created within this many hours (default 24)'
+
+# Serialize runs: a second sweep queues behind the first instead of
+# racing it against the same AWS account (in-flight runs are never
+# cancelled).
+concurrency:
+  group: sweep-aws-secrets
+  cancel-in-progress: false
+
+# The job only needs to check out the repository.
+permissions:
+  contents: read
+
+jobs:
+  sweep:
+    name: Sweep AWS Secrets Manager
+    runs-on: ubuntu-latest
+    # 30 min cap, mirroring the other janitors. AWS DeleteSecret is
+    # fast (~0.3s/call) so even a 100+ backlog drains in seconds
+    # under the 8-way xargs parallelism, but the cap is set generously
+    # to leave headroom for any actual API hang.
+    timeout-minutes: 30
+    env:
+      AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_JANITOR_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_JANITOR_SECRET_ACCESS_KEY }}
+      CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
+      CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
+      # Dispatch inputs reach the shell only as environment variables,
+      # never via inline expression interpolation inside a run script,
+      # so an input value can never be parsed as shell code.
+      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
+      GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }}
+      DRY_RUN_INPUT: ${{ github.event.inputs.dry_run || 'false' }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Verify required secrets present
+        id: verify
+        # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
+        # and sweep-cf-tunnels (hardened 2026-04-28). Same principle:
+        #   - schedule → exit 1 on missing secrets (red CI surfaces it)
+        #   - workflow_dispatch → exit 0 with warning (operator-driven,
+        #     they already accepted the repo state)
+        # The triggering event is read from the runner-provided
+        # GITHUB_EVENT_NAME variable. Sets the `skip` step output
+        # ("true" / "false") consumed by the next step.
+        run: |
+          missing=()
+          for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do
+            if [ -z "${!var:-}" ]; then
+              missing+=("$var")
+            fi
+          done
+          if [ ${#missing[@]} -gt 0 ]; then
+            if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
+              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
+              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
+              echo "::warning::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/* (the prod molecule-cp principal lacks ListSecrets)."
+              echo "skip=true" >> "$GITHUB_OUTPUT"
+              exit 0
+            fi
+            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
+            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
+            echo "::error::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/*."
+            exit 1
+          fi
+          echo "All required secrets present ✓"
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+
+      - name: Run sweep
+        if: steps.verify.outputs.skip != 'true'
+        # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-tunnels:
+        #   - Scheduled: input empty → DRY_RUN_INPUT "false" → --execute
+        #     (the whole point of an hourly janitor).
+        #   - Manual workflow_dispatch: input default true → dry-run;
+        #     operator must flip it to actually delete.
+        run: |
+          set -euo pipefail
+          if [ "$DRY_RUN_INPUT" = "true" ]; then
+            echo "Running in dry-run mode — no deletions"
+            bash scripts/ops/sweep-aws-secrets.sh
+          else
+            echo "Running with --execute — will delete identified orphans"
+            bash scripts/ops/sweep-aws-secrets.sh --execute
+          fi
--- a/scripts/ops/sweep-aws-secrets.sh
+++ b/scripts/ops/sweep-aws-secrets.sh
@ -0,0 +1,375 @@
+#!/usr/bin/env bash
+# sweep-aws-secrets.sh — safe, targeted sweep of AWS Secrets Manager
+# secrets whose corresponding tenant no longer exists.
+#
+# Why this exists: CP's tenant-delete cascade calls
+# Secrets.DeleteSecret() at deprovision time, but only when the
+# deprovision flow runs to completion (provisioner/ec2.go:806). Crashed
+# provisions, hard-failed E2E runs, and any tenant created without a
+# matching deprovision (early-bail in provisioner, manual orchestration
+# bugs) leak the per-tenant bootstrap secret. At ~$0.40/secret/month,
+# 50 leaked secrets = $20/month — enough to show up on the cost
+# dashboard.
+#
+# Observed 2026-05-03: AWS Secrets Manager line item ~$19/month with
+# only one tenant currently provisioned, indicating ~45+ orphan
+# secrets. The tenant_resources audit table (mig 024) tracks four
+# resource kinds (Cloudflare Tunnel, Cloudflare DNS, EC2 Instance,
+# Security Group) but NOT Secrets Manager — the long-term fix is to
+# add KindSecretsManagerSecret + recorder hook + reconciler enumerator.
+# Tracked separately as a controlplane issue.
+#
+# This is a parallel-shape janitor to sweep-cf-tunnels.sh:
+#   1. Query CP admin API to enumerate live org IDs (prod + staging)
+#   2. Enumerate AWS Secrets Manager secrets matching the tenant prefix
+#   3. For each secret matching `molecule/tenant/<org_id>/bootstrap`,
+#      check if <org_id> appears in the live set
+#   4. Defense-in-depth: skip secrets created within GRACE_HOURS
+#      (default 24h — window for a provision-in-progress that hasn't
+#      yet finished its first heartbeat to CP)
+#   5. Only delete secrets with NO live org counterpart AND outside
+#      the GRACE_HOURS grace window
+#
+# Dry-run by default; must pass --execute to actually delete.
+#
+# Note on deletion semantics: --force-delete-without-recovery skips
+# the 7-30 day recovery window. We accept this because (a) the grace
+# window above already filters in-flight provisions, and (b) the
+# bootstrap secret is regenerated on every reprovision — losing one
+# is recoverable by re-running the provision flow.
+#
+# Env vars required:
+#   AWS_REGION              — region the secrets live in (default: us-east-1)
+#   CP_PROD_ADMIN_TOKEN     — CP admin bearer for api.moleculesai.app
+#   CP_STAGING_ADMIN_TOKEN  — CP admin bearer for staging-api.moleculesai.app
+#   AWS_ACCESS_KEY_ID,      — IAM principal with secretsmanager:ListSecrets
+#   AWS_SECRET_ACCESS_KEY     and secretsmanager:DeleteSecret. Note: the
+#                             prod molecule-cp principal does NOT have
+#                             these permissions; the workflow uses a
+#                             dedicated janitor principal.
+#
+# Exit codes:
+#   0  — dry-run completed or sweep executed successfully
+#   1  — missing required env, API failure, or unexpected state
+#   2  — safety check failed (would delete >MAX_DELETE_PCT% of
+#         tenant-shaped secrets; refusing)
+
+set -euo pipefail
+
+DRY_RUN=1
+# Tenant secrets are durable by design — they should track 1:1 with
+# live tenants. The 50% default mirrors sweep-cf-orphans.sh (DNS
+# records, also durable) rather than sweep-cf-tunnels.sh (90%, mostly
+# orphan by design). If the live tenant count drops by more than half
+# in one sweep window, that's an incident worth investigating before
+# we erase the audit trail.
+MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}"
+GRACE_HOURS="${GRACE_HOURS:-24}"
+AWS_REGION="${AWS_REGION:-us-east-1}"
+
+for arg in "$@"; do
+  case "$arg" in
+    --execute|--no-dry-run) DRY_RUN=0 ;;
+    --help|-h)
+      grep '^#' "$0" | head -55 | sed 's/^# \{0,1\}//'
+      exit 0
+      ;;
+    *)
+      echo "unknown arg: $arg (use --help)" >&2
+      exit 1
+      ;;
+  esac
+done
+
+need() {
+  local var="$1"
+  if [ -z "${!var:-}" ]; then
+    echo "ERROR: $var is required" >&2
+    exit 1
+  fi
+}
+need CP_PROD_ADMIN_TOKEN
+need CP_STAGING_ADMIN_TOKEN
+need AWS_ACCESS_KEY_ID
+need AWS_SECRET_ACCESS_KEY
+
+if ! command -v aws >/dev/null 2>&1; then
+  echo "ERROR: aws cli is required" >&2
+  exit 1
+fi
+
+log() { echo "[$(date -u +%H:%M:%S)] $*"; }
+
+# --- Gather live sets ------------------------------------------------------
+#
+# Secret naming uses the tenant's UUID (org_id), not the slug — see
+# awsapi.TenantSecretName in molecule-controlplane. The /cp/admin/orgs
+# response includes both `id` and `slug`; we extract `id` here.
+
+log "Fetching CP prod org ids..."
+PROD_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=500" \
+  | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))")
+log "  prod orgs: $(echo "$PROD_IDS" | wc -w | tr -d ' ')"
+
+log "Fetching CP staging org ids..."
+STAGING_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \
+  "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
+  | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))")
+log "  staging orgs: $(echo "$STAGING_IDS" | wc -w | tr -d ' ')"
+
+log "Fetching AWS Secrets Manager secrets (region=$AWS_REGION)..."
+# list-secrets is paginated via NextToken. The aws cli auto-paginates
+# unless --max-items is set, but explicit pagination keeps us safe
+# from any sudden default change and lets us cap at a sane upper
+# bound. ListSecrets returns up to 100 per page; we cap at 50 pages
+# (5000 secrets) which is well past any plausible tenant count.
+PAGES_DIR=$(mktemp -d -t aws-secrets-XXXXXX)
+DELETE_PLAN=""
+NAME_MAP=""
+FAIL_LOG=""
+RESULT_LOG=""
+cleanup() {
+  rm -rf "$PAGES_DIR"
+  [ -n "$DELETE_PLAN" ] && rm -f "$DELETE_PLAN"
+  [ -n "$NAME_MAP" ] && rm -f "$NAME_MAP"
+  [ -n "$FAIL_LOG" ] && rm -f "$FAIL_LOG"
+  [ -n "$RESULT_LOG" ] && rm -f "$RESULT_LOG"
+  return 0
+}
+trap cleanup EXIT
+
+NEXT_TOKEN=""
+PAGE=1
+while :; do
+  page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json"
+  if [ -z "$NEXT_TOKEN" ]; then
+    aws secretsmanager list-secrets \
+      --region "$AWS_REGION" \
+      --filters Key=name,Values=molecule/tenant/ \
+      --max-results 100 \
+      --output json > "$page_file"
+  else
+    aws secretsmanager list-secrets \
+      --region "$AWS_REGION" \
+      --filters Key=name,Values=molecule/tenant/ \
+      --max-results 100 \
+      --next-token "$NEXT_TOKEN" \
+      --output json > "$page_file"
+  fi
+  NEXT_TOKEN=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d.get('NextToken') or '')" "$page_file")
+  PAGE=$((PAGE + 1))
+  if [ -z "$NEXT_TOKEN" ]; then break; fi
+  if [ "$PAGE" -gt 50 ]; then
+    log "::warning::stopping pagination at page 50 (5000 secrets) — re-run if more"
+    break
+  fi
+done
+
+SECRET_JSON=$(python3 -c '
+import glob, json, os, sys
+acc = {"SecretList": []}
+for f in sorted(glob.glob(os.path.join(sys.argv[1], "page-*.json"))):
+    with open(f) as fh:
+        acc["SecretList"].extend(json.load(fh).get("SecretList") or [])
+print(json.dumps(acc))
+' "$PAGES_DIR")
+TOTAL_SECRETS=$(echo "$SECRET_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['SecretList']))")
+log "  total tenant-prefixed secrets: $TOTAL_SECRETS"
+
+# --- Compute orphans -------------------------------------------------------
+#
+# Rules (in order):
+#   1. Name doesn't match `molecule/tenant/<org_id>/bootstrap` → keep
+#      (unknown — never sweep arbitrary secrets that might belong to
+#      platform infra or other tenants of this AWS account).
+#   2. Creation time missing or unparseable → keep (the grace check
+#      cannot be evaluated, so fail closed instead of deleting).
+#   3. CreatedDate within $GRACE_HOURS → keep (defense-in-depth: don't
+#      kill a secret while its provision is still mid-flight).
+#   4. org_id ∈ {prod_ids ∪ staging_ids} → keep (live tenant).
+#   5. Otherwise → delete (orphan).
+#
+# Output: one JSON object per secret, one per line, with the keys
+# action ("keep"/"delete"), reason, arn, name.
+#
+# NOTE: the Python below lives inside a single-quoted shell string —
+# it must not contain a single-quote character.
+
+export PROD_IDS STAGING_IDS GRACE_HOURS
+DECISIONS=$(echo "$SECRET_JSON" | python3 -c '
+import json, os, re, sys
+from datetime import datetime, timezone, timedelta
+
+prod_ids = set(os.environ["PROD_IDS"].split())
+staging_ids = set(os.environ["STAGING_IDS"].split())
+all_ids = prod_ids | staging_ids
+grace = timedelta(hours=int(os.environ["GRACE_HOURS"]))
+now = datetime.now(timezone.utc)
+
+# molecule/tenant/<org_id>/bootstrap — org_id is a UUID.
+_TENANT_RE = re.compile(r"^molecule/tenant/([0-9a-fA-F-]{36})/bootstrap$")
+
+def parse_iso(s):
+    """Return a timezone-aware datetime for an AWS timestamp, or None.
+
+    Accepts both renderings the aws cli can produce:
+      - ISO8601 text ("+00:00", a numeric offset, or a trailing "Z");
+      - epoch seconds as a JSON number (e.g. cli_timestamp_format=wire).
+    Text without an offset is taken as UTC so it can be compared with
+    the aware "now". Anything else yields None.
+    """
+    if not s:
+        return None
+    if isinstance(s, (int, float)):
+        try:
+            return datetime.fromtimestamp(s, tz=timezone.utc)
+        except (OverflowError, OSError, ValueError):
+            return None
+    if isinstance(s, str) and s.endswith("Z"):
+        # fromisoformat only accepts the "Z" suffix from Python 3.11.
+        s = s[:-1] + "+00:00"
+    try:
+        parsed = datetime.fromisoformat(s)
+    except (TypeError, ValueError):
+        return None
+    if parsed.tzinfo is None:
+        parsed = parsed.replace(tzinfo=timezone.utc)
+    return parsed
+
+def decide(s, all_ids, grace, now):
+    """Classify one ListSecrets entry; returns (action, reason, arn, name)."""
+    name = s.get("Name", "")
+    arn = s.get("ARN", "")
+
+    m = _TENANT_RE.match(name)
+    if not m:
+        return ("keep", "not-a-tenant-secret", arn, name)
+
+    org_id = m.group(1)
+
+    created = parse_iso(s.get("CreatedDate") or s.get("LastChangedDate"))
+    if created is None:
+        # Age unknown: the grace window cannot be checked, so a
+        # destructive sweep must not touch this secret.
+        return ("keep", "unknown-age", arn, name)
+    if (now - created) < grace:
+        return ("keep", "in-grace-window", arn, name)
+
+    if org_id in all_ids:
+        return ("keep", "live-tenant", arn, name)
+
+    return ("delete", "orphan-tenant", arn, name)
+
+d = json.loads(sys.stdin.read())
+for s in d.get("SecretList", []):
+    action, reason, arn, name = decide(s, all_ids, grace, now)
+    print(json.dumps({"action": action, "reason": reason, "arn": arn, "name": name}))
+')
+
+# --- Summarize + safety gate ----------------------------------------------
+
+DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
+TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c "
+import json, sys
+n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
+print(n)
+")
+
+log ""
+log "== Sweep plan =="
+log "  total secrets:          $TOTAL_SECRETS"
+log "  tenant-shaped secrets:  $TENANT_SECRETS"
+log "  would delete:           $DELETE_COUNT"
+log "  would keep:             $KEEP_COUNT"
+log ""
+
+# Per-reason breakdown of deletes + keep-categories worth seeing
+echo "$DECISIONS" | python3 -c "
+import json,sys,collections
+delete_c = collections.Counter()
+keep_c = collections.Counter()
+for l in sys.stdin:
+    d = json.loads(l)
+    if d['action'] == 'delete':
+        delete_c[d['reason']] += 1
+    else:
+        keep_c[d['reason']] += 1
+for reason, n in delete_c.most_common():
+    print(f'  delete/{reason}: {n}')
+for reason, n in keep_c.most_common():
+    print(f'  keep/{reason}: {n}')
+"
+
+# Safety gate operates against the tenant-shaped subset — same
+# rationale as sweep-cf-tunnels: a miscount of platform-infra
+# secrets shouldn't relax the gate.
+#
+# The threshold is validated first. With a non-integer value (e.g.
+# "50%") the `[ ... -gt ... ]` test below would return status 2,
+# which an `if` treats as false — silently SKIPPING the gate and
+# letting the deletion proceed. A bad threshold must fail closed.
+case "$MAX_DELETE_PCT" in
+  ''|*[!0-9]*)
+    echo "ERROR: MAX_DELETE_PCT must be a non-negative integer (got '$MAX_DELETE_PCT')" >&2
+    exit 1
+    ;;
+esac
+
+if [ "$TENANT_SECRETS" -gt 0 ]; then
+  # Integer percentage, rounded down.
+  PCT=$(( DELETE_COUNT * 100 / TENANT_SECRETS ))
+  if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
+    log ""
+    log "SAFETY: would delete $PCT% of tenant-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing."
+    log "  If this is expected (e.g. major cleanup after incident), rerun with"
+    log "  MAX_DELETE_PCT=$((PCT+5)) $0 $*"
+    exit 2
+  fi
+fi
+
+# Dry-run path: print up to 20 deletion candidates and exit 0
+# without touching AWS.
+if [ "$DRY_RUN" = "1" ]; then
+  log ""
+  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets."
+  log ""
+  log "First 20 secrets that would be deleted:"
+  # The reader consumes ALL of stdin and merely stops printing after
+  # 20 rows. Exiting the loop early would close the pipe while `echo`
+  # may still be writing a large decision list; the resulting
+  # broken-pipe failure of the producer, under `pipefail` + `set -e`,
+  # would make a successful dry run exit non-zero.
+  echo "$DECISIONS" | python3 -c "
+import json, sys
+shown = 0
+for l in sys.stdin:
+    d = json.loads(l)
+    if d['action'] == 'delete' and shown < 20:
+        print(f\"  {d['reason']:25s}  {d['name']}\")
+        shown += 1
+"
+  exit 0
+fi
+
+# --- Execute deletes -------------------------------------------------------
+#
+# Parallel delete loop following sweep-cf-tunnels.sh's pattern.
+# AWS Secrets Manager DeleteSecret is fast (~0.3s/call), so even a
+# serial loop would handle 100s of secrets within the workflow's
+# 30 min cap, but parallel-by-default keeps us symmetric with the
+# other sweepers and gives headroom for a one-off backlog.
+#
+# --force-delete-without-recovery skips the 7-30 day recovery window.
+# Acceptable here because (a) the GRACE_HOURS filter prevents touching
+# in-flight provisions, and (b) the secret is regenerated on every
+# fresh provision — losing one only matters for a tenant we're
+# explicitly trying to forget.
+#
+# Exit status of this section (and of the script): 0 when every
+# delete succeeded, 1 when at least one failed.
+
+CONCURRENCY="${SWEEP_CONCURRENCY:-8}"
+DELETE_PLAN=$(mktemp -t aws-secrets-plan-XXXXXX)
+NAME_MAP=$(mktemp -t aws-secrets-names-XXXXXX)
+FAIL_LOG=$(mktemp -t aws-secrets-fail-XXXXXX)
+RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX)
+
+# Build delete plan (one ARN per line) and ARN→name side-channel for
+# failure-log readability. Use ARN rather than Name on the delete
+# call because Name is mutable; ARN is the stable identifier.
+echo "$DECISIONS" | python3 -c '
+import json, sys
+plan_path = sys.argv[1]
+map_path = sys.argv[2]
+with open(plan_path, "w") as plan, open(map_path, "w") as nmap:
+    for line in sys.stdin:
+        d = json.loads(line)
+        if d.get("action") != "delete":
+            continue
+        arn = d["arn"]
+        name = d.get("name", "")
+        plan.write(arn + "\n")
+        nmap.write(arn + "\t" + name + "\n")
+' "$DELETE_PLAN" "$NAME_MAP"
+
+log ""
+log "Executing $DELETE_COUNT deletions ($CONCURRENCY-way parallel)..."
+
+export AWS_REGION NAME_MAP FAIL_LOG
+
+# One worker shell per plan line. -I already consumes exactly one
+# input line per invocation; combining it with -L is rejected by GNU
+# xargs as mutually exclusive (warning on stderr, -L ignored), so
+# only -I is passed.
+#
+# Each worker prints OK or FAIL on stdout (collected in RESULT_LOG).
+# On failure the aws cli's stderr is captured — stdout is discarded —
+# and appended, flattened to one line, to FAIL_LOG next to the secret
+# name and ARN so the cause of a failed delete is visible in the run
+# log instead of being thrown away.
+#
+# The worker script is a single-quoted string: it must not contain a
+# single-quote character.
+# shellcheck disable=SC2016
+xargs -P "$CONCURRENCY" -I {} bash -c '
+  arn="$1"
+  if err=$(aws secretsmanager delete-secret \
+       --region "$AWS_REGION" \
+       --secret-id "$arn" \
+       --force-delete-without-recovery \
+       --output json 2>&1 >/dev/null); then
+    echo OK
+  else
+    name=$(awk -F"\t" -v a="$arn" "\$1==a {print \$2; exit}" "$NAME_MAP")
+    echo FAIL
+    echo "FAIL $name $arn :: $(printf "%s" "$err" | tr "\n" " ")" >> "$FAIL_LOG"
+  fi
+' _ {} < "$DELETE_PLAN" > "$RESULT_LOG"
+
+# grep -c exits 1 when the count is 0; `|| true` keeps set -e quiet.
+DELETED=$(grep -c '^OK$' "$RESULT_LOG" || true)
+FAILED=$(grep -c '^FAIL$' "$RESULT_LOG" || true)
+
+log ""
+log "Done. deleted=$DELETED failed=$FAILED"
+if [ "$FAILED" -ne 0 ]; then
+  log "Failure detail (first 20):"
+  head -20 "$FAIL_LOG" | while IFS= read -r fl; do log "  $fl"; done
+fi
+# Final status: non-zero when any delete failed.
+[ "$FAILED" -eq 0 ]