name: Sweep stale AWS Secrets Manager secrets

# Janitor for per-tenant AWS Secrets Manager secrets
# (`molecule/tenant/*/bootstrap` — NOTE(review): original text read
# `molecule/tenant//bootstrap`; the `*` wildcard matches the prefix used
# everywhere else in this file — confirm) whose backing tenant no
# longer exists. Parallel-shape to sweep-cf-tunnels.yml and
# sweep-cf-orphans.yml — different cloud, same justification.
#
# Why this exists separately from a long-term reconciler integration:
# - molecule-controlplane's tenant_resources audit table (mig 024)
#   currently tracks four resource kinds: CloudflareTunnel,
#   CloudflareDNS, EC2Instance, SecurityGroup. SecretsManager is
#   not in the list, so the existing reconciler doesn't catch
#   orphan secrets.
# - At ~$0.40/secret/month the cost grew to ~$19/month before this
#   sweeper was written, indicating ~45+ orphan secrets from
#   crashed provisions and incomplete deprovision flows.
# - The proper fix (KindSecretsManagerSecret + recorder hook +
#   reconciler enumerator) is filed as a separate controlplane
#   issue. This sweeper is the immediate cost-relief stopgap.
#
# IAM principal: AWS_JANITOR_ACCESS_KEY_ID / AWS_JANITOR_SECRET_ACCESS_KEY.
# This is a DEDICATED principal — the production `molecule-cp` IAM
# user lacks `secretsmanager:ListSecrets` (it only has
# Get/Create/Update/Delete on specific resources, scoped to its
# operational needs). The janitor needs ListSecrets across the
# `molecule/tenant/*` prefix, which warrants a separate principal so
# we don't broaden the prod-CP policy.
#
# Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
# sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
# the mostly-orphan tunnels) refuses to nuke past the threshold.

on:
  schedule:
    # Hourly at :30 — offsets from sweep-cf-orphans (:15) and
    # sweep-cf-tunnels (:45) so the three janitors don't burst the
    # CP admin endpoints at the same minute.
    - cron: '30 * * * *'
  workflow_dispatch:
    inputs:
      dry_run:
        description: "Dry run only — list what would be deleted, no deletion"
        required: false
        type: boolean
        default: true
      max_delete_pct:
        description: "Override safety gate (default 50, set higher only for major cleanup)"
        required: false
        default: "50"
      grace_hours:
        description: "Skip secrets created within this many hours (default 24)"
        required: false
        default: "24"

# Don't let two sweeps race the same AWS account.
concurrency:
  group: sweep-aws-secrets
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  sweep:
    name: Sweep AWS Secrets Manager
    runs-on: ubuntu-latest
    # 30 min cap, mirroring the other janitors. AWS DeleteSecret is
    # fast (~0.3s/call) so even a 100+ backlog drains in seconds
    # under the 8-way xargs parallelism, but the cap is set generously
    # to leave headroom for any actual API hang.
    timeout-minutes: 30
    env:
      AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_JANITOR_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_JANITOR_SECRET_ACCESS_KEY }}
      CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
      CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
      GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Verify required secrets present
        id: verify
        # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
        # and sweep-cf-tunnels (hardened 2026-04-28). Same principle:
        # - schedule → exit 1 on missing secrets (red CI surfaces it)
        # - workflow_dispatch → exit 0 with warning (operator-driven,
        #   they already accepted the repo state)
        run: |
          missing=()
          for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do
            if [ -z "${!var:-}" ]; then
              missing+=("$var")
            fi
          done
          if [ ${#missing[@]} -gt 0 ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
              echo "::warning::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/* (the prod molecule-cp principal lacks ListSecrets)."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
            echo "::error::AWS_JANITOR_* must belong to a principal with secretsmanager:ListSecrets and secretsmanager:DeleteSecret on molecule/tenant/*."
            exit 1
          fi
          echo "All required secrets present ✓"
          echo "skip=false" >> "$GITHUB_OUTPUT"

      - name: Run sweep
        if: steps.verify.outputs.skip != 'true'
        # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-tunnels:
        # - Scheduled: input empty → "false" → --execute (the whole
        #   point of an hourly janitor).
        # - Manual workflow_dispatch: input default true → dry-run;
        #   operator must flip it to actually delete.
        run: |
          set -euo pipefail
          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
            echo "Running in dry-run mode — no deletions"
            bash scripts/ops/sweep-aws-secrets.sh
          else
            echo "Running with --execute — will delete identified orphans"
            bash scripts/ops/sweep-aws-secrets.sh --execute
          fi