diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml
index 40997fb93..9b4f8e444 100644
--- a/.gitea/workflows/e2e-staging-saas.yml
+++ b/.gitea/workflows/e2e-staging-saas.yml
@@ -354,6 +354,65 @@ jobs:
           fi
           exit 0
 
+  # ── POST-RUN DNS PRUNE (core#81045 recurrence fix) ───────────────────────────
+  #
+  # The full-lifecycle E2E harness creates e2e-smoke-* and e2e-tmpl-* DNS
+  # records under the staging zone. When teardown is skipped (runner cancel,
+  # CP transient error, trap miss) these records leak and eventually exhaust
+  # the Cloudflare DNS record quota (error 81045), blocking staging tenant
+  # provisioning. This job runs after every real E2E run and prunes records
+  # older than a conservative age threshold.
+  #
+  # Design notes:
+  #   - needs: e2e-staging-saas ties this job to the real E2E job
+  #     (push/dispatch/cron).
+  #   - if: always() && needs.e2e-staging-saas.result != 'skipped' so it runs
+  #     even when the E2E job fails or is cancelled (exactly when records are
+  #     most likely to leak) but NOT when the E2E job was skipped, e.g. on PRs.
+  #     A bare always() would also run after a skipped dependency.
+  #   - continue-on-error: true — pruning is best-effort janitorial cleanup;
+  #     a transient CF API blip here must not block the merge gate. The
+  #     sweep-stale-e2e-orgs workflow is the backstop.
+  #   - Token and zone id come from repository secrets ONLY; never hardcoded.
+  #   - --min-age-hours is conservative (2h) so in-flight records from a long
+  #     E2E run or a recently-started dispatch are never touched.
+  # bp-required: pending #3140 — non-required / best-effort cleanup job.
+  prune-stale-e2e-dns:
+    name: Prune stale e2e DNS records
+    runs-on: ubuntu-latest
+    needs: e2e-staging-saas
+    if: always() && needs.e2e-staging-saas.result != 'skipped'
+    # mc#3140: best-effort cleanup; transient CF API failures must not block merge.
+    continue-on-error: true
+    timeout-minutes: 10
+    permissions:
+      contents: read
+    env:
+      CF_API_TOKEN: ${{ secrets.CF_STAGING_DNS_API_TOKEN }}
+      CF_ZONE_ID: ${{ secrets.CF_STAGING_ZONE_ID }}
+      # Staging tenant DNS records live under staging.moleculesai.app, not the
+      # apex zone, so the prefix matcher must anchor to the staging subdomain.
+      PRUNE_ZONE_DOMAIN: staging.moleculesai.app
+      # The script refuses (exit 2) when more than MAX_DELETE_PCT percent
+      # (default 50) of the matched e2e-* records are stale. After a leak nearly
+      # every match is stale, so the default would block the cleanup this job
+      # exists for; the name-pattern and age filters remain the safety net.
+      MAX_DELETE_PCT: "100"
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Skip prune (Cloudflare secrets not configured)
+        if: env.CF_API_TOKEN == '' || env.CF_ZONE_ID == ''
+        run: |
+          echo "::warning::CF_STAGING_DNS_API_TOKEN or CF_STAGING_ZONE_ID not configured — skipping DNS prune."
+          exit 0
+
+      - name: Prune stale e2e DNS records
+        if: env.CF_API_TOKEN != '' && env.CF_ZONE_ID != ''
+        run: |
+          set -euo pipefail
+          ./scripts/ops/prune_cf_e2e_dns.sh --apply --min-age-hours 2
+
   # ── PLATFORM-MANAGED BOOT REGRESSION (moonshot/kimi NOT_CONFIGURED) ──────────
   #
   # The REAL-boot complement to the deterministic unit suite
diff --git a/scripts/ops/prune_cf_e2e_dns.sh b/scripts/ops/prune_cf_e2e_dns.sh
new file mode 100755
index 000000000..c73c4bac4
--- /dev/null
+++ b/scripts/ops/prune_cf_e2e_dns.sh
@@ -0,0 +1,390 @@
+#!/usr/bin/env bash
+# prune_cf_e2e_dns.sh — targeted, fail-closed cleanup of disposable E2E DNS
+# records that accumulate under the moleculesai.app zone and exhaust the
+# Cloudflare DNS record quota (code 81045).
+#
+# Why this exists: staging E2E harnesses create DNS records for slugs like
+# e2e-smoke-* and e2e-tmpl-* (see
+# tests/e2e/test_staging_full_saas.sh and tests/e2e/test_template_delivery_e2e.sh).
+# When teardown is skipped (CI cancellation, runner crash, transient CP error),
+# these records leak. Cloudflare caps DNS records per zone; once the cap is
+# hit, new tenant provisioning fails with CF code 81045. This script is the
+# immediate unblock tool: it deletes clearly-ephemeral test records by pattern
+# + age, independent of CP state.
+#
+# Scope (conservative):
+#   - Records whose full name matches
+#       e2e-smoke-*.<zone-domain>
+#       e2e-tmpl-*.<zone-domain>
+#   - Records older than --min-age-hours / PRUNE_MIN_AGE_HOURS (default 24)
+#     so in-flight runs are not touched.
+#   - Anything else is kept untouched.
+#
+# Dry-run by default; must pass --apply (or set PRUNE_APPLY=1) to delete.
+#
+# Required env:
+#   CF_API_TOKEN — Cloudflare API token with Zone:DNS:Edit (and Zone:Read) on the target zone.
+#                  Falls back to CLOUDFLARE_API_TOKEN.
+#   CF_ZONE_ID   — Cloudflare zone id for moleculesai.app (or staging zone).
+#                  Falls back to CLOUDFLARE_ZONE_ID.
+#
+# Optional env:
+#   PRUNE_APPLY=1 — same as --apply (both accepted).
+#   PRUNE_MIN_AGE_HOURS=<n> — default minimum age in hours (default: 24).
+#   MAX_DELETE_PCT=<n> — refuse to delete more than this percentage of
+#                        matched ephemeral records (default: 50).
+#   PRUNE_ZONE_DOMAIN=<domain> — zone domain to anchor matches (default: moleculesai.app).
+#
+# Exit codes:
+#   0 — dry-run completed or prune executed successfully
+#   1 — missing/invalid env or args, API failure, or unexpected state
+#   2 — safety gate refused the prune
+
+set -euo pipefail
+
+DRY_RUN=1
+MIN_AGE_HOURS="${PRUNE_MIN_AGE_HOURS:-24}"
+MAX_DELETE_PCT="${MAX_DELETE_PCT:-50}"
+ZONE_DOMAIN="${PRUNE_ZONE_DOMAIN:-moleculesai.app}"
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --apply|--execute|--no-dry-run) DRY_RUN=0; shift ;;
+    --min-age-hours)
+      shift
+      MIN_AGE_HOURS="${1:-}"
+      if ! [[ "$MIN_AGE_HOURS" =~ ^[0-9]+$ ]]; then
+        echo "ERROR: --min-age-hours requires a non-negative integer" >&2
+        exit 1
+      fi
+      shift
+      ;;
+    --help|-h)
+      sed -n '1,/^set -euo pipefail$/p' "$0" | grep '^#' | sed 's/^# \{0,1\}//'
+      exit 0
+      ;;
+    *)  # any other flag or positional argument is rejected
+      echo "unknown arg: $1 (use --help)" >&2
+      exit 1
+      ;;
+  esac
+done
+
+if [ "${PRUNE_APPLY:-0}" = "1" ]; then
+  DRY_RUN=0
+fi
+
+need() {
+  local var="$1"
+  if [ -z "${!var:-}" ]; then
+    echo "ERROR: $var is required" >&2
+    exit 1
+  fi
+}
+
+# Accept canonical operator-host names OR CI-scoped names.
+CF_API_TOKEN="${CF_API_TOKEN:-${CLOUDFLARE_API_TOKEN:-}}"
+CF_ZONE_ID="${CF_ZONE_ID:-${CLOUDFLARE_ZONE_ID:-}}"
+
+need CF_API_TOKEN
+need CF_ZONE_ID
+
+if ! command -v curl >/dev/null 2>&1; then
+  echo "ERROR: curl is required" >&2
+  exit 1
+fi
+if ! command -v python3 >/dev/null 2>&1; then
+  echo "ERROR: python3 is required" >&2
+  exit 1
+fi
+
+if ! [[ "$MIN_AGE_HOURS" =~ ^[0-9]+$ ]]; then
+  echo "ERROR: PRUNE_MIN_AGE_HOURS/--min-age-hours must be a non-negative integer" >&2
+  exit 1
+fi
+if ! [[ "$MAX_DELETE_PCT" =~ ^[0-9]+$ ]]; then  # fail closed: a non-integer would make the safety-gate test error out and be skipped
+  echo "ERROR: MAX_DELETE_PCT must be a non-negative integer" >&2
+  exit 1
+fi
+
+log() { echo "[$(date -u +%H:%M:%S)] $*"; }
+
+# --- Preflight: verify CF token + zone BEFORE any list/delete work ---------
+log "Preflight: verifying CF token + zone..."
+PF_TOKEN_JSON=$(curl -sS -m 10 -H "Authorization: Bearer $CF_API_TOKEN" \
+  "https://api.cloudflare.com/client/v4/user/tokens/verify") || PF_TOKEN_JSON=""  # curl failure: empty body is rejected by the JSON check below (exit 1)
+if ! echo "$PF_TOKEN_JSON" | python3 -c '
+import json, sys
+try:
+    p = json.load(sys.stdin)
+except Exception as exc:
+    print(f"ERROR: non-JSON from /user/tokens/verify: {exc}", file=sys.stderr)
+    raise SystemExit(1)
+if not p.get("success"):
+    errs = p.get("errors") or []
+    detail = "; ".join(
+        "{code}: {msg}".format(code=e.get("code", "?"), msg=e.get("message", "?"))
+        for e in errs
+    ) or "unknown"
+    print(f"ERROR: CF token verify returned success=false: {detail}", file=sys.stderr)
+    raise SystemExit(1)
+status = (p.get("result") or {}).get("status", "?")
+if status != "active":
+    print(f"ERROR: CF token is not active (status={status})", file=sys.stderr)
+    raise SystemExit(1)
+'; then
+  log "  CF token preflight FAILED — verify CF_API_TOKEN/CLOUDFLARE_API_TOKEN is active."
+  exit 1
+fi
+log "  CF token active ✓"
+
+PF_ZONE_JSON=$(curl -sS -m 10 -H "Authorization: Bearer $CF_API_TOKEN" \
+  "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID") || PF_ZONE_JSON=""  # curl failure: empty body is rejected by the JSON check below (exit 1)
+if ! echo "$PF_ZONE_JSON" | CF_ZONE_ID="$CF_ZONE_ID" python3 -c '
+import json, os, sys
+try:
+    p = json.load(sys.stdin)
+except Exception as exc:
+    print("ERROR: non-JSON from /zones/{}: {}".format(os.environ["CF_ZONE_ID"], exc), file=sys.stderr)
+    raise SystemExit(1)
+if not p.get("success"):
+    errs = p.get("errors") or []
+    detail = "; ".join(
+        "{code}: {msg}".format(code=e.get("code", "?"), msg=e.get("message", "?"))
+        for e in errs
+    ) or "unknown"
+    print(f"ERROR: zone lookup returned success=false: {detail}", file=sys.stderr)
+    raise SystemExit(1)
+res = p.get("result") or {}
+if res.get("id") != os.environ["CF_ZONE_ID"]:
+    print("ERROR: zone id mismatch", file=sys.stderr)
+    raise SystemExit(1)
+'; then
+  log "  CF zone preflight FAILED — verify CF_ZONE_ID/CLOUDFLARE_ZONE_ID and Zone:Read permission."
+  exit 1
+fi
+log "  zone $CF_ZONE_ID reachable ✓"
+
+# --- Gather DNS records with explicit pagination ----------------------------
+log "Fetching DNS records from zone $CF_ZONE_ID (paginated)..."
+PAGES_DIR=$(mktemp -d -t cf-dns-XXXXXX)
+PLAN_FILE=""
+FAIL_LOG=""
+cleanup() {
+  rm -rf "$PAGES_DIR"
+  [ -n "$PLAN_FILE" ] && rm -f "$PLAN_FILE"
+  [ -n "$FAIL_LOG" ] && rm -f "$FAIL_LOG"
+  return 0
+}
+trap cleanup EXIT
+
+PAGE=1
+NEXT_PAGE=1
+while [ -n "$NEXT_PAGE" ]; do
+  page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json"
+  curl -sS -m 30 -f -H "Authorization: Bearer $CF_API_TOKEN" \
+    "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records?per_page=100&page=$NEXT_PAGE" \
+    > "$page_file" || {
+    log "ERROR: CF DNS list page $NEXT_PAGE failed (non-2xx or network error)."
+    exit 1
+  }
+
+  if ! python3 -c '
+import json, sys
+try:
+    p = json.load(open(sys.argv[1]))
+except Exception as exc:
+    print(f"ERROR: non-JSON list response: {exc}", file=sys.stderr)
+    raise SystemExit(1)
+if not p.get("success"):
+    errs = p.get("errors") or []
+    detail = "; ".join("{code}: {msg}".format(code=e.get("code","?"), msg=e.get("message","?")) for e in errs) or "unknown"
+    print(f"ERROR: CF DNS list returned success=false: {detail}", file=sys.stderr)
+    raise SystemExit(1)
+if not isinstance(p.get("result"), list):
+    print("ERROR: CF DNS list result is not a list", file=sys.stderr)
+    raise SystemExit(1)
+' "$page_file"; then
+    log "ERROR: CF DNS list page $NEXT_PAGE returned errors or malformed JSON"
+    exit 1
+  fi
+
+  HAS_MORE=$(python3 -c '
+import json, sys
+p = json.load(open(sys.argv[1]))
+ri = p.get("result_info") or {}
+print(1 if ri.get("page", 0) < ri.get("total_pages", 0) else "")
+' "$page_file")
+  PAGE=$((PAGE + 1))
+  if [ -z "$HAS_MORE" ]; then
+    NEXT_PAGE=""
+  else
+    NEXT_PAGE=$PAGE
+  fi
+  if [ "$PAGE" -gt 500 ]; then
+    echo "::warning::stopping pagination at page 500 (50k records) — re-run if more"  # echo, not log: the timestamp prefix would stop the runner parsing the workflow command
+    break
+  fi
+done
+
+CF_JSON=$(python3 -c '
+import glob, json, os, sys
+acc = {"result": []}
+for f in sorted(glob.glob(os.path.join(sys.argv[1], "page-*.json"))):
+    with open(f) as fh:
+        acc["result"].extend(json.load(fh).get("result") or [])
+print(json.dumps(acc))
+' "$PAGES_DIR")
+TOTAL_CF=$(echo "$CF_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['result']))")
+log "  total CF records: $TOTAL_CF"
+
+# --- Compute targets ---------------------------------------------------------
+export MIN_AGE_HOURS ZONE_DOMAIN
+DECISIONS=$(echo "$CF_JSON" | python3 -c '
+import json, os, re, sys
+from datetime import datetime, timezone, timedelta
+
+min_age = timedelta(hours=int(os.environ["MIN_AGE_HOURS"]))
+zone_domain = os.environ["ZONE_DOMAIN"]
+now = datetime.now(timezone.utc)
+
+# Conservative: only the two known disposable E2E prefixes, anchored to the
+# configured zone domain so similarly-named records in other zones never match.
+# The prefix must include the trailing hyphen so e2e-smokeprod.moleculesai.app
+# is NOT matched.
+EPHEMERAL_RE = re.compile(
+    r"^(e2e-smoke-|e2e-tmpl-)[a-zA-Z0-9_-]*\." + re.escape(zone_domain) + r"$"
+)
+
+def parse_iso(s):
+    if not s:
+        return None
+    s = s.strip()
+    if s.endswith("Z"):
+        s = s[:-1] + "+00:00"
+    try:
+        return datetime.fromisoformat(s)  # NOTE(review): a value with no UTC offset parses as naive and the age subtraction in decide() would raise; confirm CF always sends an offset
+    except ValueError:
+        return None
+
+def decide(r):
+    rid = r.get("id", "")
+    name = r.get("name", "")
+    typ = r.get("type", "")
+    created = parse_iso(r.get("created_on"))
+
+    if not EPHEMERAL_RE.match(name):
+        return ("keep", "not-ephemeral-pattern", rid, name, typ)
+
+    if created is None:
+        return ("keep", "missing-created_on", rid, name, typ)
+
+    if (now - created) < min_age:
+        return ("keep", "too-new", rid, name, typ)
+
+    return ("delete", "stale-ephemeral", rid, name, typ)
+
+d = json.load(sys.stdin)
+for r in d.get("result", []):
+    action, reason, rid, name, typ = decide(r)
+    print(json.dumps({
+        "action": action,
+        "reason": reason,
+        "id": rid,
+        "name": name,
+        "type": typ,
+        "created_on": r.get("created_on", ""),
+    }))
+')
+
+MATCHED_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-ephemeral-pattern'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+KEEP_COUNT=$((MATCHED_COUNT - DELETE_COUNT))
+
+log ""
+log "== Prune plan =="
+log "  zone domain:             $ZONE_DOMAIN"
+log "  total CF records:        $TOTAL_CF"
+log "  matched ephemeral shape: $MATCHED_COUNT"
+log "  would delete:            $DELETE_COUNT"
+log "  would keep (in scope):   $KEEP_COUNT"
+log "  min-age-hours:           $MIN_AGE_HOURS"
+log ""
+
+printf '%s' "$DECISIONS" | python3 -c "
+import json, sys, collections
+c = collections.Counter()
+for l in sys.stdin:
+    d = json.loads(l)
+    c[d['reason']] += 1
+for reason, n in c.most_common():
+    print(f'  {reason}: {n}')
+"
+
+# --- Safety gate (percentage is of records matching the e2e pattern) --------
+if [ "$MATCHED_COUNT" -gt 0 ]; then
+  PCT=$(( DELETE_COUNT * 100 / MATCHED_COUNT ))
+  if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
+    log ""
+    log "SAFETY: would delete $PCT% of matched ephemeral records (threshold $MAX_DELETE_PCT%) — refusing."
+    log "       If this is expected, rerun with MAX_DELETE_PCT=$((PCT+5)) $0$([ "$DRY_RUN" = "1" ] || printf ' --apply') --min-age-hours $MIN_AGE_HOURS"  # rebuilt from parsed state: the original args were shifted away during parsing
+    exit 2
+  fi
+fi
+
+if [ "$DRY_RUN" = "1" ]; then
+  log ""
+  log "Dry run complete. Pass --apply (or PRUNE_APPLY=1) to delete $DELETE_COUNT records."
+  log ""
+  log "First 20 records that would be deleted:"
+  printf '%s' "$DECISIONS" | python3 -c "
+import json, sys
+shown = 0
+for l in sys.stdin:
+    d = json.loads(l)
+    if d['action'] == 'delete':
+        print(f\"  {d['created_on'][:19]:20s} {d['name']}\")
+        shown += 1
+        if shown >= 20: break
+"
+  exit 0
+fi
+
+# --- Execute deletes ---------------------------------------------------------
+PLAN_FILE=$(mktemp -t cf-dns-plan-XXXXXX)
+FAIL_LOG=$(mktemp -t cf-dns-fail-XXXXXX)
+
+printf '%s' "$DECISIONS" | python3 -c '
+import json, sys
+with open(sys.argv[1], "w") as plan:
+    for line in sys.stdin:
+        d = json.loads(line)
+        if d.get("action") == "delete":
+            plan.write(d["id"] + "\t" + d["name"] + "\n")
+' "$PLAN_FILE"
+
+log ""
+log "Executing $DELETE_COUNT deletions..."
+
+DELETED=0
+FAILED=0
+while IFS=$'\t' read -r rid name; do
+  [ -n "$rid" ] || continue
+  if curl -sS -m 15 -f -X DELETE \
+      -H "Authorization: Bearer $CF_API_TOKEN" \
+      "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records/$rid" \
+      >/dev/null 2>&1; then
+    DELETED=$((DELETED + 1))
+  else
+    FAILED=$((FAILED + 1))
+    echo "FAIL $name $rid" >> "$FAIL_LOG"
+  fi
+done < "$PLAN_FILE"
+
+log ""
+log "Done. deleted=$DELETED failed=$FAILED"
+if [ "$FAILED" -ne 0 ]; then
+  log "Failure detail (first 20):"
+  head -20 "$FAIL_LOG" | while IFS= read -r fl; do log "  $fl"; done
+fi
+[ "$FAILED" -eq 0 ]  # script exit status: non-zero when any delete failed
diff --git a/tests/ops/test_prune_cf_e2e_dns_fail_closed.sh b/tests/ops/test_prune_cf_e2e_dns_fail_closed.sh
new file mode 100755
index 000000000..b925ac6d5
--- /dev/null
+++ b/tests/ops/test_prune_cf_e2e_dns_fail_closed.sh
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+# Regression test for scripts/ops/prune_cf_e2e_dns.sh — verifies fail-closed
+# behavior for Cloudflare API errors and record-selection safety.
+#
+# Tests:
+#   1. Non-2xx CF DNS list response aborts before any delete attempt.
+#   2. Malformed JSON CF DNS list response aborts before any delete attempt.
+#   3.
CF DNS list result that is not an array aborts before any delete attempt.
+#   4. A record matching the e2e-smoke-* pattern but younger than min-age is kept.
+#   5. A non-ephemeral record (api.moleculesai.app) older than min-age is kept.
+#   6. Happy path: an old e2e-smoke-* record is deleted (sentinel reached).
+set -uo pipefail  # no -e: run_case must observe non-zero exits from the script under test
+
+SCRIPT="${SCRIPT:-scripts/ops/prune_cf_e2e_dns.sh}"
+
+PASS=0
+FAIL=0
+
+run_case() {  # args: name list_exit list_body expect(true|false-abort|false-keep) [zone_domain]
+  local name="$1" list_exit="$2" list_body="$3" expect_delete_sentinel="$4" zone_domain="${5:-moleculesai.app}"
+  local tmp
+  tmp=$(mktemp -d -t cf-e2e-prune-fail-closed-XXXXXX)
+  local delete_sentinel="$tmp/delete_reached"
+
+  # URL-aware curl mock, put first on PATH below. Token/zone preflight URLs
+  # always succeed; the DNS list URL prints $list_body and exits $list_exit;
+  # a DELETE (or any unrecognized URL) writes the sentinel file.
+  cat > "$tmp/curl" <<'MOCK'
+#!/usr/bin/env bash
+url=""
+method="GET"
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    -X) method="$2"; shift ;;
+    https://*) url="$1" ;;
+  esac
+  shift
+done
+case "$url" in
+  */user/tokens/verify)
+    echo '{"success":true,"result":{"status":"active"}}'
+    exit 0
+    ;;
+  */zones/*/dns_records*)
+    if [ "$method" = "DELETE" ]; then
+      echo 'reached' > "$DELETE_SENTINEL"
+      echo '{"success":true,"result":{"id":"deleted"}}'
+      exit 0
+    fi
+    __LIST_BODY__
+    exit __LIST_EXIT__
+    ;;
+  */zones/*)
+    echo '{"success":true,"result":{"id":"zone"}}'
+    exit 0
+    ;;
+  *)
+    echo 'reached' > "$DELETE_SENTINEL"
+    echo '{"success":true,"result":{"id":"deleted"}}'
+    exit 0
+    ;;
+esac
+MOCK
+  printf '%s\n' "$list_body" > "$tmp/list_body.txt"
+  sed -i "s|__LIST_BODY__|cat \"\$LIST_BODY_FILE\"|g; s|__LIST_EXIT__|$list_exit|g" "$tmp/curl"  # fill in the two mock placeholders
+  chmod +x "$tmp/curl"
+
+  local out="$tmp/out" err="$tmp/err"
+  # Export paths so the mock script can find the list body file and sentinel.
+  export DELETE_SENTINEL="$delete_sentinel"
+  export LIST_BODY_FILE="$tmp/list_body.txt"
+  # Allow the single-record happy-path case to delete 100% of matched records.
+  export MAX_DELETE_PCT=100
+  PATH="$tmp:$PATH" \
+    CF_API_TOKEN=tok \
+    CF_ZONE_ID=zone \
+    PRUNE_MIN_AGE_HOURS=1 \
+    PRUNE_ZONE_DOMAIN="$zone_domain" \
+    bash "$SCRIPT" --apply > "$out" 2> "$err"
+  local actual_exit=$?  # exit status of the script under test
+  local case_fail=0
+
+  if [ "$expect_delete_sentinel" = "true" ]; then
+    # Happy path: script must reach delete and exit 0.
+    if [ ! -f "$delete_sentinel" ]; then
+      echo "  ✗ $name: delete sentinel missing — prune did not reach delete step" >&2
+      case_fail=1
+    fi
+    if [ "$actual_exit" -ne 0 ]; then
+      echo "  ✗ $name: expected exit 0, got $actual_exit" >&2
+      case_fail=1
+    fi
+  else
+    # Fail-closed / keep cases: delete sentinel must NOT be written.
+    if [ -f "$delete_sentinel" ]; then
+      echo "  ✗ $name: delete sentinel exists — prune reached delete step unexpectedly" >&2
+      case_fail=1
+    fi
+    if [ "$expect_delete_sentinel" = "false-abort" ] && [ "$actual_exit" -eq 0 ]; then
+      echo "  ✗ $name: expected non-zero exit for abort case, got 0" >&2
+      case_fail=1
+    fi
+    if [ "$expect_delete_sentinel" = "false-keep" ] && [ "$actual_exit" -ne 0 ]; then
+      echo "  ✗ $name: expected exit 0 for keep case, got $actual_exit" >&2
+      case_fail=1
+    fi
+  fi
+
+  if [ "$case_fail" -eq 0 ]; then
+    echo "  ✓ $name"
+    PASS=$((PASS + 1))
+  else
+    echo "    stdout:" >&2
+    sed 's/^/      /' "$out" >&2
+    echo "    stderr:" >&2
+    sed 's/^/      /' "$err" >&2
+    FAIL=$((FAIL + 1))
+  fi
+
+  rm -rf "$tmp"
+}
+
+echo "Test: prune_cf_e2e_dns fail-closed boundary"
+echo
+
+# Bad CF list responses must abort before delete.
+run_case "CF DNS list returns 500" 55 '{"success":false,"errors":[{"code":1000}]}' false-abort  # NOTE(review): mock exits 55; real curl -f exits 22 on HTTP >= 400 — both are non-zero
+run_case "CF DNS list returns malformed JSON" 0 'this is not json' false-abort
+run_case "CF DNS list returns non-array result" 0 '{"success":true,"result":{"id":"rec1"}}' false-abort
+
+# Helper to build a DNS list result with one record, given created_on ISO string
+# and optional zone domain (default: moleculesai.app).
+make_list() { + local created_on="$1" zone_domain="${2:-moleculesai.app}" + cat <