# molecule-core/.github/workflows/sweep-cf-orphans.yml

name: Sweep stale Cloudflare DNS records

# Janitor for Cloudflare DNS records whose backing tenant/workspace no
# longer exists. Without this loop, every short-lived E2E or canary
# leaves a CF record on the moleculesai.app zone — the zone has a
# 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions
# start failing with code 81045 once exhausted.
#
# Why a separate workflow vs sweep-stale-e2e-orgs.yml:
#   - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug
#     drives the cascade). It assumes CP has the org row to drive the
#     deprovision from. It doesn't catch records left behind when CP
#     itself never knew about the tenant (canary scratch, manual ops
#     experiments) or when the cascade's CF-delete branch failed.
#   - sweep-cf-orphans.sh enumerates the CF zone directly and matches
#     each record against live CP slugs + AWS EC2 names. It catches
#     leaks the CP-driven sweep can't.
#
# Safety: the script's own MAX_DELETE_PCT gate refuses to delete more
# than 50% of records in a single run (the default; a manual
# workflow_dispatch can override it). If something has gone wrong
# (e.g. the CP admin endpoint returns no orgs, so every tenant looks
# orphaned), the gate halts before damage. Decision-function unit
# tests in scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
# classifier.

on:
  schedule:
    # Hourly. Mirrors the sweep-stale-e2e-orgs cadence so the two
    # janitors converge within the same hourly tick, but fires at :15
    # rather than at the top of the hour so the two do not start
    # together. CF API rate budget is generous (1200 req/5min); a
    # single sweep makes ~1 list + N deletes (N<=quota/2).
    - cron: '15 * * * *'  # offset from sweep-stale-e2e-orgs (top of hour)
  workflow_dispatch:
    inputs:
      dry_run:
        description: "Dry run only — list what would be deleted, no deletion"
        required: false
        type: boolean
        # Manual runs are non-destructive unless the operator opts in.
        default: true
      max_delete_pct:
        description: "Override safety gate (default 50, set higher only for major cleanup)"
        required: false
        # Declared explicitly for parity with dry_run. The value is
        # exported to the job as the MAX_DELETE_PCT env var, so it is
        # kept as a string.
        type: string
        default: "50"
  # No `merge_group:` trigger on purpose. This is a janitor — it doesn't
  # need to gate merges, and including it as written before #2088 fired
  # the full sweep job (or its secret-check) on every PR going through
  # the merge queue, generating one red CI run per merge-queue eval. If
  # this workflow is ever wired up as a required check, re-add
  #   merge_group: { types: [checks_requested] }
  # AND gate the sweep step with `if: github.event_name != 'merge_group'`
  # so merge-queue evals report success without actually running.

# Don't let two sweeps race the same zone. workflow_dispatch during a
# scheduled run would otherwise issue duplicate DELETE calls.
# cancel-in-progress is false so an overlapping run waits for the
# in-flight sweep instead of cancelling it part-way through.
concurrency:
  group: sweep-cf-orphans
  cancel-in-progress: false

# Least privilege for GITHUB_TOKEN: the job only needs to check out the
# repository. Credentials for Cloudflare / CP / AWS are supplied
# separately through the job's env secrets.
permissions:
  contents: read

jobs:
  # Single job: check out the repo, verify secrets, run the sweep script.
  sweep:
    name: Sweep CF orphans
    runs-on: ubuntu-latest
    # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
    # within one cron interval instead of burning a full tick. Realistic
    # worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE
    # each individually capped at 10s by the script's curl -m flag.
    timeout-minutes: 3
    # Job-level env so every step sees the same values: the verify step
    # checks that the six secrets are non-empty, and the sweep script
    # inherits them from the environment.
    env:
      # Cloudflare API token and the zone whose records are enumerated.
      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
      CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
      # Control-plane admin tokens for prod and staging — presumably
      # used to list live CP slugs; confirm against the script.
      CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
      CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
      # AWS credentials and region — presumably for the EC2 name lookup
      # the script matches records against; confirm against the script.
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_DEFAULT_REGION: us-east-2
      # Scheduled runs carry no inputs, so the expression falls back to
      # '50'; manual runs pass the max_delete_pct input through.
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}

    steps:
      # Action pinned to a full commit SHA; the trailing comment records
      # the corresponding release tag.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Verify required secrets present
        id: verify
        # Gate: refuse to "pass" when the sweep cannot actually run.
        #
        # Background (hardened 2026-04-28): this step used to soft-skip
        # on every trigger. All six secrets were unset on this repo for
        # an unknown duration; each hourly run printed a yellow
        # ::warning:: and exited 0, so the workflow looked healthy while
        # doing nothing. CF orphans grew to 152/200 (~76% of the zone
        # quota) before a manual `dig`-driven audit noticed. A janitor
        # that is green while idle is indistinguishable from a healthy
        # one, so missing secrets are now a hard failure on every
        # trigger except an operator's ad-hoc run.
        #
        # Outcome by trigger when any secret is missing:
        #   - schedule / workflow_run / push → exit 1 (the red run
        #     surfaces the misconfiguration on the next tick)
        #   - workflow_dispatch              → warning, skip=true, exit 0
        #     (the operator ran this by hand, already accepted the repo
        #     state, and wants a short-circuit so they can rerun after
        #     fixing the secret)
        #
        # Output: `skip` is "true" only on the dispatch short-circuit and
        # "false" when every secret is present; the sweep step keys off it.
        run: |
          required_secrets=(
            CF_API_TOKEN
            CF_ZONE_ID
            CP_PROD_ADMIN_TOKEN
            CP_STAGING_ADMIN_TOKEN
            AWS_ACCESS_KEY_ID
            AWS_SECRET_ACCESS_KEY
          )

          # Collect the names (never the values) of unset or empty secrets.
          absent=()
          for name in "${required_secrets[@]}"; do
            [ -n "${!name:-}" ] || absent+=("$name")
          done

          if [ "${#absent[@]}" -eq 0 ]; then
            echo "All required secrets present ✓"
            echo "skip=false" >> "$GITHUB_OUTPUT"
          elif [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
            # Operator-triggered: warn and let the run finish green.
            echo "::warning::skipping sweep — secrets not configured: ${absent[*]}"
            echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
            echo "skip=true" >> "$GITHUB_OUTPUT"
          else
            # Any automated trigger: fail loudly.
            echo "::error::sweep cannot run — required secrets missing: ${absent[*]}"
            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
            echo "::error::a silent skip masked an active CF DNS leak (152/200 zone records) caught only by a manual audit on 2026-04-28; this gate exists to make the gap visible."
            exit 1
          fi

      - name: Run sweep
        if: steps.verify.outputs.skip != 'true'
        # Runs scripts/ops/sweep-cf-orphans.sh, either as a dry run or
        # with --execute.
        #
        # Schedule-vs-dispatch dry-run asymmetry (intentional):
        #   - Scheduled runs: github.event.inputs.dry_run is empty →
        #     the expression below falls back to "false" → the script
        #     runs with --execute (the whole point of an hourly janitor).
        #   - Manual workflow_dispatch: the dry_run input defaults to
        #     true (see on.workflow_dispatch.inputs.dry_run), so an
        #     ad-hoc operator-triggered run is dry-run by default; they
        #     have to flip the toggle to actually delete.
        # The script's MAX_DELETE_PCT gate (default 50%) is the second
        # line of defense regardless of mode.
        env:
          # The input reaches the script as data through the environment
          # instead of being spliced into the shell source. The name is
          # workflow-specific so it cannot shadow a variable the sweep
          # script itself might read.
          DISPATCH_DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
        run: |
          set -euo pipefail
          # Fail closed: only the two expected literals select a mode.
          # Anything else aborts rather than falling through to the
          # destructive --execute path.
          case "$DISPATCH_DRY_RUN" in
            true)
              echo "Running in dry-run mode — no deletions"
              bash scripts/ops/sweep-cf-orphans.sh
              ;;
            false)
              echo "Running with --execute — will delete identified orphans"
              bash scripts/ops/sweep-cf-orphans.sh --execute
              ;;
            *)
              echo "::error::unexpected dry_run value '${DISPATCH_DRY_RUN}' — expected 'true' or 'false'; refusing to run the sweep"
              exit 1
              ;;
          esac