Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 6.
- [Release notes](https://github.com/actions/checkout/releases)
- [Commits](https://github.com/actions/checkout/compare/v4...v6)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: '6'
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
name: Sweep stale Cloudflare DNS records

# Janitor for Cloudflare DNS records whose backing tenant/workspace no
# longer exists. Without this loop, every short-lived E2E or canary
# tenant leaves a CF record on the moleculesai.app zone — the zone has
# a 200-record quota (controlplane#239 hit it 2026-04-23) and
# provisions start failing with code 81045 once exhausted.
#
# Why a separate workflow vs sweep-stale-e2e-orgs.yml:
# - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug
#   drives the cascade). It assumes CP has the org row to drive the
#   deprovision from. It doesn't catch records left behind when CP
#   itself never knew about the tenant (canary scratch, manual ops
#   experiments) or when the cascade's CF-delete branch failed.
# - sweep-cf-orphans.sh enumerates the CF zone directly and matches
#   each record against live CP slugs + AWS EC2 names. It catches
#   leaks the CP-driven sweep can't.
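#
#   A sketch of that per-record rule (illustrative only; the real
#   matcher lives in sweep-cf-orphans.sh and these names are
#   hypothetical):
#     is_orphan() {                    # $1 = DNS record name
#       local name="$1"
#       [[ " ${cp_slugs[*]} " != *" $name "* ]] &&
#       [[ " ${ec2_names[*]} " != *" $name "* ]]
#     }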
#
# Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more
# than 50% of records in a single run. If something has gone weird
# (CP admin endpoint returns no orgs → every tenant looks orphaned)
# the gate halts before damage. Decision-function unit tests in
# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
# classifier.
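#
# A minimal sketch of the MAX_DELETE_PCT gate, assuming the script
# tracks candidate and total record counts (variable names here are
# hypothetical):
#   if (( candidates * 100 > total * MAX_DELETE_PCT )); then
#     echo "refusing: ${candidates}/${total} exceeds ${MAX_DELETE_PCT}% gate" >&2
#     exit 1
#   fi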

on:
  schedule:
    # Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors
    # converge on the same tick. CF API rate budget is generous (1200
    # req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2).
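    # Worked out with the 200-record quota above: at most ~1 + 100 = 101
    # requests per sweep, comfortably under budget even if they all land
    # in a single 5-minute window.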
    - cron: '15 * * * *' # offset from sweep-stale-e2e-orgs (top of hour)
  workflow_dispatch:
    inputs:
      dry_run:
        description: "Dry run only — list what would be deleted, no deletion"
        required: false
        type: boolean
        default: true
      max_delete_pct:
        description: "Override safety gate (default 50, set higher only for major cleanup)"
        required: false
        default: "50"
  # No `merge_group:` trigger on purpose. This is a janitor — it doesn't
  # need to gate merges, and including it (as written before #2088) fired
  # the full sweep job (or its secret-check) on every PR going through
  # the merge queue, generating one red CI run per merge-queue eval. If
  # this workflow is ever wired up as a required check, re-add
  #   merge_group: { types: [checks_requested] }
  # AND gate the sweep step with `if: github.event_name != 'merge_group'`
  # so merge-queue evals report success without actually running.
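  # For reference, that re-wired shape would look roughly like this
  # (a sketch, not active config):
  #   on:
  #     merge_group: { types: [checks_requested] }
  #   ...
  #   - name: Run sweep
  #     if: github.event_name != 'merge_group'  # skipped step, job still green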

# Don't let two sweeps race the same zone. workflow_dispatch during a
# scheduled run would otherwise issue duplicate DELETE calls.
concurrency:
  group: sweep-cf-orphans
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  sweep:
    name: Sweep CF orphans
    runs-on: ubuntu-latest
    # A 3 min timeout surfaces hangs (CF API stall, AWS
    # describe-instances stuck) within one cron interval instead of
    # burning a full tick. Realistic worst case is ~2 min: 4 sequential
    # curls + 1 aws + N×CF-DELETE, each individually capped at 10s by
    # the script's curl -m flag.
    timeout-minutes: 3
    env:
      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
      CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
      CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
      CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_DEFAULT_REGION: us-east-2
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
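      # On a schedule trigger github.event.inputs is empty, so the ||
      # fallback above pins the gate at its default of 50.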

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Verify required secrets present
        id: verify
        # Schedule-vs-dispatch behaviour split (hardened 2026-04-28
        # after the silent-no-op incident below):
        #
        # The earlier soft-skip-on-schedule policy hid a real leak. All
        # six secrets were unset on this repo for an unknown duration;
        # every hourly run printed a yellow ::warning:: and exited 0,
        # so the workflow registered as "passing" while doing nothing.
        # CF orphans accumulated to 152/200 (~76% of the zone quota
        # gone) before a manual `dig`-driven audit caught it. Anything
        # that runs as a janitor and reports green while idle is
        # indistinguishable from "the janitor is healthy" — so we now
        # treat schedule (and any future workflow_run/push triggers)
        # as a hard fail when secrets are missing.
        #
        # - schedule / workflow_run / push → exit 1 (a red CI run
        #   surfaces the misconfiguration on the next tick)
        # - workflow_dispatch → exit 0 with a warning
        #   (an operator ran this ad hoc; they already accepted the
        #   state of the repo and want the workflow to short-circuit
        #   so they can rerun after fixing the secret)
        run: |
          missing=()
          for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
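            # "${!var:-}" is bash indirect expansion: it yields the value of
            # the variable named by $var, or empty if that secret is unset.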
            if [ -z "${!var:-}" ]; then
              missing+=("$var")
            fi
          done
          if [ ${#missing[@]} -gt 0 ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
              echo "::warning::set them at Settings → Secrets and variables → Actions, then rerun."
              echo "skip=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
            echo "::error::set them at Settings → Secrets and variables → Actions, or disable this workflow."
            echo "::error::a silent skip masked an active CF DNS leak (152/200 zone records) caught only by a manual audit on 2026-04-28; this gate exists to make the gap visible."
            exit 1
          fi
          echo "All required secrets present ✓"
          echo "skip=false" >> "$GITHUB_OUTPUT"

      - name: Run sweep
        if: steps.verify.outputs.skip != 'true'
        # Schedule-vs-dispatch dry-run asymmetry (intentional):
        # - Scheduled runs: github.event.inputs.dry_run is empty →
        #   defaults to "false" below → the script runs with --execute
        #   (the whole point of an hourly janitor).
        # - Manual workflow_dispatch: the dry_run input defaults to true
        #   above, so an ad-hoc operator-triggered run is dry-run by
        #   default; the operator has to flip the toggle to actually
        #   delete.
        # The script's MAX_DELETE_PCT gate (default 50%) is the second
        # line of defense regardless of mode.
        run: |
          set -euo pipefail
          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
            echo "Running in dry-run mode — no deletions"
            bash scripts/ops/sweep-cf-orphans.sh
          else
            echo "Running with --execute — will delete identified orphans"
            bash scripts/ops/sweep-cf-orphans.sh --execute
          fi