diff --git a/.github/workflows/sweep-aws-secrets.yml b/.github/workflows/sweep-aws-secrets.yml new file mode 100644 index 00000000..39e57978 --- /dev/null +++ b/.github/workflows/sweep-aws-secrets.yml @@ -0,0 +1,129 @@ +name: Sweep stale AWS Secrets Manager secrets + +# Janitor for per-tenant AWS Secrets Manager secrets +# (`molecule/tenant/<org_id>/bootstrap`) whose backing tenant no +# longer exists. Parallel-shape to sweep-cf-tunnels.yml and +# sweep-cf-orphans.yml — different cloud, same justification. +# +# Why this exists separately from a long-term reconciler integration: +# - molecule-controlplane's tenant_resources audit table (mig 024) +# currently tracks four resource kinds: CloudflareTunnel, +# CloudflareDNS, EC2Instance, SecurityGroup. SecretsManager is +# not in the list, so the existing reconciler doesn't catch +# orphan secrets. +# - At ~$0.40/secret/month the cost grew to ~$19/month before this +# sweeper was written, indicating ~45+ orphan secrets from +# crashed provisions and incomplete deprovision flows. +# - The proper fix (KindSecretsManagerSecret + recorder hook + +# reconciler enumerator) is filed as a separate controlplane +# issue. This sweeper is the immediate cost-relief stopgap. +# +# IAM principal: AWS_JANITOR_ACCESS_KEY_ID / AWS_JANITOR_SECRET_ACCESS_KEY. +# This is a DEDICATED principal — the production `molecule-cp` IAM +# user lacks `secretsmanager:ListSecrets` (it only has +# Get/Create/Update/Delete on specific resources, scoped to its +# operational needs). The janitor needs ListSecrets across the +# `molecule/tenant/*` prefix, which warrants a separate principal so +# we don't broaden the prod-CP policy. +# +# Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring +# sweep-cf-orphans.yml — tenant secrets are durable by design, unlike +# the mostly-orphan tunnels) refuses to nuke past the threshold. 
+ +on: + schedule: + # Hourly at :30 — offsets from sweep-cf-orphans (:15) and + # sweep-cf-tunnels (:45) so the three janitors don't burst the + # CP admin endpoints at the same minute. + - cron: '30 * * * *' + workflow_dispatch: + inputs: + dry_run: + description: "Dry run only — list what would be deleted, no deletion" + required: false + type: boolean + default: true + max_delete_pct: + description: "Override safety gate (default 50, set higher only for major cleanup)" + required: false + default: "50" + grace_hours: + description: "Skip secrets created within this many hours (default 24)" + required: false + default: "24" + +# Don't let two sweeps race the same AWS account. +concurrency: + group: sweep-aws-secrets + cancel-in-progress: false + +permissions: + contents: read + +jobs: + sweep: + name: Sweep AWS Secrets Manager + runs-on: ubuntu-latest + # 30 min cap, mirroring the other janitors. AWS DeleteSecret is + # fast (~0.3s/call) so even a 100+ backlog drains in seconds + # under the 8-way xargs parallelism, but the cap is set generously + # to leave headroom for any actual API hang. + timeout-minutes: 30 + env: + AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_JANITOR_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_JANITOR_SECRET_ACCESS_KEY }} + CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }} + CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }} + MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }} + GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Verify required secrets present + id: verify + # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans + # and sweep-cf-tunnels (hardened 2026-04-28). 
Same principle: + # - schedule → exit 1 on missing secrets (red CI surfaces it) + # - workflow_dispatch → exit 0 with warning (operator-driven, + # they already accepted the repo state) + run: | + missing=() + for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do + if [ -z "${!var:-}" ]; then + missing+=("$var") + fi + done + if [ ${#missing[@]} -gt 0 ]; then + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "::warning::skipping sweep — secrets not configured: ${missing[*]}" + echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun." + echo "::warning::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/* (the prod molecule-cp principal lacks ListSecrets)." + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo "::error::sweep cannot run — required secrets missing: ${missing[*]}" + echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow." + echo "::error::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/*." + exit 1 + fi + echo "All required secrets present ✓" + echo "skip=false" >> "$GITHUB_OUTPUT" + + - name: Run sweep + if: steps.verify.outputs.skip != 'true' + # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-tunnels: + # - Scheduled: input empty → "false" → --execute (the whole + # point of an hourly janitor). + # - Manual workflow_dispatch: input default true → dry-run; + # operator must flip it to actually delete. 
+ run: | + set -euo pipefail + if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then + echo "Running in dry-run mode — no deletions" + bash scripts/ops/sweep-aws-secrets.sh + else + echo "Running with --execute — will delete identified orphans" + bash scripts/ops/sweep-aws-secrets.sh --execute + fi diff --git a/scripts/ops/sweep-aws-secrets.sh b/scripts/ops/sweep-aws-secrets.sh new file mode 100755 index 00000000..55db0a11 --- /dev/null +++ b/scripts/ops/sweep-aws-secrets.sh @@ -0,0 +1,375 @@ +#!/usr/bin/env bash +# sweep-aws-secrets.sh — safe, targeted sweep of AWS Secrets Manager +# secrets whose corresponding tenant no longer exists. +# +# Why this exists: CP's tenant-delete cascade calls +# Secrets.DeleteSecret() at deprovision time, but only when the +# deprovision flow runs to completion (provisioner/ec2.go:806). Crashed +# provisions, hard-failed E2E runs, and any tenant created without a +# matching deprovision (early-bail in provisioner, manual orchestration +# bugs) leak the per-tenant bootstrap secret. At ~$0.40/secret/month, +# 50 leaked secrets = $20/month — enough to show up on the cost +# dashboard. +# +# Observed 2026-05-03: AWS Secrets Manager line item ~$19/month with +# only one tenant currently provisioned, indicating ~45+ orphan +# secrets. The tenant_resources audit table (mig 024) tracks four +# resource kinds (Cloudflare Tunnel, Cloudflare DNS, EC2 Instance, +# Security Group) but NOT Secrets Manager — the long-term fix is to +# add KindSecretsManagerSecret + recorder hook + reconciler enumerator. +# Tracked separately as a controlplane issue. +# +# This is a parallel-shape janitor to sweep-cf-tunnels.sh: +# 1. Query CP admin API to enumerate live org IDs (prod + staging) +# 2. Enumerate AWS Secrets Manager secrets matching the tenant prefix +# 3. For each secret matching `molecule/tenant/<org_id>/bootstrap`, +# check if <org_id> appears in the live set +# 4. 
Defense-in-depth: skip secrets created in the last 24h +# (window for a provision-in-progress that hasn't yet finished +# its first heartbeat to CP) +# 5. Only delete secrets with NO live org counterpart AND outside +# the 24h grace window +# +# Dry-run by default; must pass --execute to actually delete. +# +# Note on deletion semantics: --force-delete-without-recovery skips +# the 7-30 day recovery window. We accept this because (a) the grace +# window above already filters in-flight provisions, and (b) the +# bootstrap secret is regenerated on every reprovision — losing one +# is recoverable by re-running the provision flow. +# +# Env vars required: +# AWS_REGION — region the secrets live in (default: us-east-1) +# CP_PROD_ADMIN_TOKEN — CP admin bearer for api.moleculesai.app +# CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app +# AWS_ACCESS_KEY_ID, — IAM principal with secretsmanager:ListSecrets +# AWS_SECRET_ACCESS_KEY and secretsmanager:DeleteSecret. Note: the +# prod molecule-cp principal does NOT have +# these permissions; the workflow uses a +# dedicated janitor principal. +# +# Exit codes: +# 0 — dry-run completed or sweep executed successfully +# 1 — missing required env, API failure, or unexpected state +# 2 — safety check failed (would delete >MAX_DELETE_PCT% of +# tenant-shaped secrets; refusing) + +set -euo pipefail + +DRY_RUN=1 +# Tenant secrets are durable by design — they should track 1:1 with +# live tenants. The 50% default mirrors sweep-cf-orphans.sh (DNS +# records, also durable) rather than sweep-cf-tunnels.sh (90%, mostly +# orphan by design). If the live tenant count drops by more than half +# in one sweep window, that's an incident worth investigating before +# we erase the audit trail. 
+MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}" +GRACE_HOURS="${GRACE_HOURS:-24}" +AWS_REGION="${AWS_REGION:-us-east-1}" + +for arg in "$@"; do + case "$arg" in + --execute|--no-dry-run) DRY_RUN=0 ;; + --help|-h) + grep '^#' "$0" | head -55 | sed 's/^# \{0,1\}//' + exit 0 + ;; + *) + echo "unknown arg: $arg (use --help)" >&2 + exit 1 + ;; + esac +done + +need() { + local var="$1" + if [ -z "${!var:-}" ]; then + echo "ERROR: $var is required" >&2 + exit 1 + fi +} +need CP_PROD_ADMIN_TOKEN +need CP_STAGING_ADMIN_TOKEN +need AWS_ACCESS_KEY_ID +need AWS_SECRET_ACCESS_KEY + +if ! command -v aws >/dev/null 2>&1; then + echo "ERROR: aws cli is required" >&2 + exit 1 +fi + +log() { echo "[$(date -u +%H:%M:%S)] $*"; } + +# --- Gather live sets ------------------------------------------------------ +# +# Secret naming uses the tenant's UUID (org_id), not the slug — see +# awsapi.TenantSecretName in molecule-controlplane. The /cp/admin/orgs +# response includes both `id` and `slug`; we extract `id` here. + +log "Fetching CP prod org ids..." +PROD_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \ + "https://api.moleculesai.app/cp/admin/orgs?limit=500" \ + | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))") +log " prod orgs: $(echo "$PROD_IDS" | wc -w | tr -d ' ')" + +log "Fetching CP staging org ids..." +STAGING_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \ + "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \ + | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))") +log " staging orgs: $(echo "$STAGING_IDS" | wc -w | tr -d ' ')" + +log "Fetching AWS Secrets Manager secrets (region=$AWS_REGION)..." +# list-secrets is paginated via NextToken. The aws cli auto-paginates +# unless --max-items is set, but explicit pagination keeps us safe +# from any sudden default change and lets us cap at a sane upper +# bound. 
ListSecrets returns up to 100 per page; we cap at 50 pages +# (5000 secrets) which is well past any plausible tenant count. +PAGES_DIR=$(mktemp -d -t aws-secrets-XXXXXX) +DELETE_PLAN="" +NAME_MAP="" +FAIL_LOG="" +RESULT_LOG="" +cleanup() { + rm -rf "$PAGES_DIR" + [ -n "$DELETE_PLAN" ] && rm -f "$DELETE_PLAN" + [ -n "$NAME_MAP" ] && rm -f "$NAME_MAP" + [ -n "$FAIL_LOG" ] && rm -f "$FAIL_LOG" + [ -n "$RESULT_LOG" ] && rm -f "$RESULT_LOG" + return 0 +} +trap cleanup EXIT + +NEXT_TOKEN="" +PAGE=1 +while :; do + page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json" + if [ -z "$NEXT_TOKEN" ]; then + aws secretsmanager list-secrets \ + --region "$AWS_REGION" \ + --filters Key=name,Values=molecule/tenant/ \ + --max-results 100 \ + --output json > "$page_file" + else + aws secretsmanager list-secrets \ + --region "$AWS_REGION" \ + --filters Key=name,Values=molecule/tenant/ \ + --max-results 100 \ + --next-token "$NEXT_TOKEN" \ + --output json > "$page_file" + fi + NEXT_TOKEN=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d.get('NextToken') or '')" "$page_file") + PAGE=$((PAGE + 1)) + if [ -z "$NEXT_TOKEN" ]; then break; fi + if [ "$PAGE" -gt 50 ]; then + log "::warning::stopping pagination at page 50 (5000 secrets) — re-run if more" + break + fi +done + +SECRET_JSON=$(python3 -c ' +import glob, json, os, sys +acc = {"SecretList": []} +for f in sorted(glob.glob(os.path.join(sys.argv[1], "page-*.json"))): + with open(f) as fh: + acc["SecretList"].extend(json.load(fh).get("SecretList") or []) +print(json.dumps(acc)) +' "$PAGES_DIR") +TOTAL_SECRETS=$(echo "$SECRET_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['SecretList']))") +log " total tenant-prefixed secrets: $TOTAL_SECRETS" + +# --- Compute orphans ------------------------------------------------------- +# +# Rules (in order): +# 1. 
Name doesn't match `molecule/tenant/<org_id>/bootstrap` → keep +# (unknown — never sweep arbitrary secrets that might belong to +# platform infra or other tenants of this AWS account). +# 2. CreatedDate within $GRACE_HOURS → keep (defense-in-depth: don't +# kill a secret while its provision is still mid-flight). +# 3. org_id ∈ {prod_ids ∪ staging_ids} → keep (live tenant). +# 4. Otherwise → delete (orphan). + +export PROD_IDS STAGING_IDS GRACE_HOURS +DECISIONS=$(echo "$SECRET_JSON" | python3 -c ' +import json, os, re, sys +from datetime import datetime, timezone, timedelta + +prod_ids = set(os.environ["PROD_IDS"].split()) +staging_ids = set(os.environ["STAGING_IDS"].split()) +all_ids = prod_ids | staging_ids +grace = timedelta(hours=int(os.environ["GRACE_HOURS"])) +now = datetime.now(timezone.utc) + +# molecule/tenant/<org_id>/bootstrap — org_id is a UUID. +_TENANT_RE = re.compile(r"^molecule/tenant/([0-9a-fA-F-]{36})/bootstrap$") + +def parse_iso(s): + if not s: + return None + # AWS returns ISO8601 with timezone (sometimes "+00:00", sometimes + # numeric offset). datetime.fromisoformat handles both since 3.11. 
+ try: + return datetime.fromisoformat(s) + except ValueError: + return None + +def decide(s, all_ids, grace, now): + name = s.get("Name", "") + arn = s.get("ARN", "") + + m = _TENANT_RE.match(name) + if not m: + return ("keep", "not-a-tenant-secret", arn, name) + + org_id = m.group(1) + + created = parse_iso(s.get("CreatedDate") or s.get("LastChangedDate")) + if created is not None and (now - created) < grace: + return ("keep", "in-grace-window", arn, name) + + if org_id in all_ids: + return ("keep", "live-tenant", arn, name) + + return ("delete", "orphan-tenant", arn, name) + +d = json.loads(sys.stdin.read()) +for s in d.get("SecretList", []): + action, reason, arn, name = decide(s, all_ids, grace, now) + print(json.dumps({"action": action, "reason": reason, "arn": arn, "name": name})) +') + +# --- Summarize + safety gate ---------------------------------------------- + +DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") +KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT)) +TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c " +import json, sys +n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret') +print(n) +") + +log "" +log "== Sweep plan ==" +log " total secrets: $TOTAL_SECRETS" +log " tenant-shaped secrets: $TENANT_SECRETS" +log " would delete: $DELETE_COUNT" +log " would keep: $KEEP_COUNT" +log "" + +# Per-reason breakdown of deletes + keep-categories worth seeing +echo "$DECISIONS" | python3 -c " +import json,sys,collections +delete_c = collections.Counter() +keep_c = collections.Counter() +for l in sys.stdin: + d = json.loads(l) + if d['action'] == 'delete': + delete_c[d['reason']] += 1 + else: + keep_c[d['reason']] += 1 +for reason, n in delete_c.most_common(): + print(f' delete/{reason}: {n}') +for reason, n in keep_c.most_common(): + print(f' keep/{reason}: {n}') +" + +# Safety gate operates against the tenant-shaped subset — same +# rationale as 
sweep-cf-tunnels: a miscount of platform-infra +# secrets shouldn't relax the gate. +if [ "$TENANT_SECRETS" -gt 0 ]; then + PCT=$(( DELETE_COUNT * 100 / TENANT_SECRETS )) + if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then + log "" + log "SAFETY: would delete $PCT% of tenant-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing." + log " If this is expected (e.g. major cleanup after incident), rerun with" + log " MAX_DELETE_PCT=$((PCT+5)) $0 $*" + exit 2 + fi +fi + +if [ "$DRY_RUN" = "1" ]; then + log "" + log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets." + log "" + log "First 20 secrets that would be deleted:" + echo "$DECISIONS" | python3 -c " +import json, sys +shown = 0 +for l in sys.stdin: + d = json.loads(l) + if d['action'] == 'delete': + print(f\" {d['reason']:25s} {d['name']}\") + shown += 1 + if shown >= 20: break +" + exit 0 +fi + +# --- Execute deletes ------------------------------------------------------- +# +# Parallel delete loop following sweep-cf-tunnels.sh's pattern. +# AWS Secrets Manager DeleteSecret is fast (~0.3s/call), so even a +# serial loop would handle 100s of secrets within the workflow's +# 30 min cap, but parallel-by-default keeps us symmetric with the +# other sweepers and gives headroom for a one-off backlog. +# +# --force-delete-without-recovery skips the 7-30 day recovery window. +# Acceptable here because (a) the GRACE_HOURS filter prevents touching +# in-flight provisions, and (b) the secret is regenerated on every +# fresh provision — losing one only matters for a tenant we're +# explicitly trying to forget. + +CONCURRENCY="${SWEEP_CONCURRENCY:-8}" +DELETE_PLAN=$(mktemp -t aws-secrets-plan-XXXXXX) +NAME_MAP=$(mktemp -t aws-secrets-names-XXXXXX) +FAIL_LOG=$(mktemp -t aws-secrets-fail-XXXXXX) +RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX) + +# Build delete plan (one ARN per line) and id→name side-channel for +# failure-log readability. 
Use ARN rather than Name on the delete +# call because Name is mutable; ARN is the stable identifier. +echo "$DECISIONS" | python3 -c ' +import json, sys +plan_path = sys.argv[1] +map_path = sys.argv[2] +with open(plan_path, "w") as plan, open(map_path, "w") as nmap: + for line in sys.stdin: + d = json.loads(line) + if d.get("action") != "delete": + continue + arn = d["arn"] + name = d.get("name", "") + plan.write(arn + "\n") + nmap.write(arn + "\t" + name + "\n") +' "$DELETE_PLAN" "$NAME_MAP" + +log "" +log "Executing $DELETE_COUNT deletions ($CONCURRENCY-way parallel)..." + +export AWS_REGION NAME_MAP FAIL_LOG + +# shellcheck disable=SC2016 +xargs -P "$CONCURRENCY" -L 1 -I {} bash -c ' + arn="$1" + if aws secretsmanager delete-secret \ + --region "$AWS_REGION" \ + --secret-id "$arn" \ + --force-delete-without-recovery \ + --output json >/dev/null 2>&1; then + echo OK + else + name=$(awk -F"\t" -v a="$arn" "\$1==a {print \$2; exit}" "$NAME_MAP") + echo FAIL + echo "FAIL $name $arn" >> "$FAIL_LOG" + fi +' _ {} < "$DELETE_PLAN" > "$RESULT_LOG" + +DELETED=$(grep -c '^OK$' "$RESULT_LOG" || true) +FAILED=$(grep -c '^FAIL$' "$RESULT_LOG" || true) + +log "" +log "Done. deleted=$DELETED failed=$FAILED" +if [ "$FAILED" -ne 0 ]; then + log "Failure detail (first 20):" + head -20 "$FAIL_LOG" | while IFS= read -r fl; do log " $fl"; done +fi +[ "$FAILED" -eq 0 ]