diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml index c377d9e9..9a3a5fa3 100644 --- a/.github/workflows/test-ops-scripts.yml +++ b/.github/workflows/test-ops-scripts.yml @@ -15,6 +15,12 @@ on: paths: - 'scripts/ops/**' - '.github/workflows/test-ops-scripts.yml' + merge_group: + types: [checks_requested] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: test: diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh index cdd58f21..569bcbcf 100755 --- a/scripts/ops/sweep-cf-orphans.sh +++ b/scripts/ops/sweep-cf-orphans.sh @@ -117,66 +117,69 @@ log " CF records: $TOTAL_CF" # 5. Anything else → keep (we only sweep patterns we understand). export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF +# Edits inside the CANONICAL DECIDE block below must mirror +# scripts/ops/sweep_cf_decide.py — the parity test in +# test_sweep_cf_decide.py asserts they match byte-for-byte. DECISIONS=$(echo "$CF_JSON" | python3 -c ' import json, os, re, sys d = json.load(sys.stdin) prod_slugs = set(os.environ["PROD_SLUGS"].split()) staging_slugs = set(os.environ["STAGING_SLUGS"].split()) +all_slugs = prod_slugs | staging_slugs ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n) +_PLATFORM_CORE_NAMES = { + "api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app", + "send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app", + "staging-api.moleculesai.app", +} +_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$") +_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$") +_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$") + # CANONICAL DECIDE BEGIN -# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the -# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte. 
-def decide(r, prod_slugs, staging_slugs, ec2_names): +def decide(r, all_slugs, ec2_names): n = r["name"] rid = r["id"] typ = r["type"] - all_slugs = prod_slugs | staging_slugs - # Rule 1: platform core — leave alone if n == "moleculesai.app": return ("keep", "apex", rid, n, typ) if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): return ("keep", "verification/key", rid, n, typ) - if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app", - "send.moleculesai.app","status.moleculesai.app","www.moleculesai.app", - "staging-api.moleculesai.app"}: + if n in _PLATFORM_CORE_NAMES: return ("keep", "platform-core", rid, n, typ) - # Rule 3: ws--.(staging.)moleculesai.app - m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _WS_RE.match(n) if m: prefix = m.group(1) - # Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain. + # Live EC2 names share the ws-{8 hex}-{hex} shape with the DNS subdomain. for ename in ec2_names: if ename.startswith(prefix): return ("keep", "live-ec2", rid, n, typ) return ("delete", "orphan-ws", rid, n, typ) - # Rule 4: e2e-* tenants (includes canary, canvas variants) - m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _E2E_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-e2e-tenant", rid, n, typ) return ("delete", "orphan-e2e-tenant", rid, n, typ) - # Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app) - m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n) + m = _TENANT_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-tenant", rid, n, typ) - # Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*") - # To avoid false-positive nukes on ad-hoc records, we KEEP anything that - # does not match a known pattern. Orphan only for explicit tenant-shaped names. 
+ # KEEP unknown tenant-shaped names — avoid false-positive nukes on + # ad-hoc records (e.g. hermes-final-*) that do not match a known slug. return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) return ("keep", "not-a-pattern-we-sweep", rid, n, typ) # CANONICAL DECIDE END for r in d["result"]: - action, reason, rid, name, typ = decide(r, prod_slugs, staging_slugs, ec2_names) + action, reason, rid, name, typ = decide(r, all_slugs, ec2_names) print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ})) ') diff --git a/scripts/ops/sweep_cf_decide.py b/scripts/ops/sweep_cf_decide.py index 2e36d7cb..23cfb2fd 100644 --- a/scripts/ops/sweep_cf_decide.py +++ b/scripts/ops/sweep_cf_decide.py @@ -11,10 +11,11 @@ If you change the rules: edit BOTH this file AND the inline block in ``# CANONICAL DECIDE BEGIN`` to ``# CANONICAL DECIDE END`` markers in both files; the parity check compares those slices). -Inputs to ``decide(record, prod_slugs, staging_slugs, ec2_names)``: +Inputs to ``decide(record, all_slugs, ec2_names)``: record Cloudflare DNS record dict {name, id, type} - prod_slugs set of CP-prod org slugs (live tenants) - staging_slugs set of CP-staging org slugs + all_slugs set of CP org slugs (prod ∪ staging) — caller computes the + union once instead of per-record (decide is hot-path: 100s + to 1000s of records per sweep) ec2_names set of live EC2 Name tags (e.g. ``ws-d3605ef2-f7d``) Returns ``(action, reason, id, name, type)`` matching the bash heredoc. @@ -22,53 +23,57 @@ Returns ``(action, reason, id, name, type)`` matching the bash heredoc. from __future__ import annotations import re -from typing import Iterable + + +# Pre-compile per-record regexes once at module load — saves the per-call +# pattern-cache lookup across 1000s of CF records per sweep. Mirrored at +# the same scope in sweep-cf-orphans.sh's heredoc. 
+_PLATFORM_CORE_NAMES = { + "api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app", + "send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app", + "staging-api.moleculesai.app", +} +_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$") +_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$") +_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$") # CANONICAL DECIDE BEGIN -def decide(r, prod_slugs, staging_slugs, ec2_names): +def decide(r, all_slugs, ec2_names): n = r["name"] rid = r["id"] typ = r["type"] - all_slugs = prod_slugs | staging_slugs - # Rule 1: platform core — leave alone if n == "moleculesai.app": return ("keep", "apex", rid, n, typ) if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): return ("keep", "verification/key", rid, n, typ) - if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app", - "send.moleculesai.app","status.moleculesai.app","www.moleculesai.app", - "staging-api.moleculesai.app"}: + if n in _PLATFORM_CORE_NAMES: return ("keep", "platform-core", rid, n, typ) - # Rule 3: ws--.(staging.)moleculesai.app - m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _WS_RE.match(n) if m: prefix = m.group(1) - # Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain. + # Live EC2 names share the ws-{8 hex}-{hex} shape with the DNS subdomain. 
for ename in ec2_names: if ename.startswith(prefix): return ("keep", "live-ec2", rid, n, typ) return ("delete", "orphan-ws", rid, n, typ) - # Rule 4: e2e-* tenants (includes canary, canvas variants) - m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _E2E_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-e2e-tenant", rid, n, typ) return ("delete", "orphan-e2e-tenant", rid, n, typ) - # Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app) - m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n) + m = _TENANT_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-tenant", rid, n, typ) - # Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*") - # To avoid false-positive nukes on ad-hoc records, we KEEP anything that - # does not match a known pattern. Orphan only for explicit tenant-shaped names. + # KEEP unknown tenant-shaped names — avoid false-positive nukes on + # ad-hoc records (e.g. hermes-final-*) that do not match a known slug. return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) return ("keep", "not-a-pattern-we-sweep", rid, n, typ) @@ -78,13 +83,12 @@ def decide(r, prod_slugs, staging_slugs, ec2_names): def safety_gate(total: int, delete_count: int, max_delete_pct: int = 50) -> bool: """Return True iff the sweep is safe to execute. - Mirrors the shell-side gate in sweep-cf-orphans.sh: if the deletion - fraction exceeds ``max_delete_pct`` the sweep refuses to run. The - bash script computes the integer percentage as ``DELETE_COUNT*100/TOTAL`` - — keeping the same arithmetic here so a future "raise to 75%" tweak - needs to be made in only one place semantically. + Mirrors the shell-side gate: if the deletion fraction exceeds + ``max_delete_pct`` the sweep refuses to run. 
Same integer arithmetic + as the bash script (``DELETE_COUNT*100/TOTAL``) so a future threshold + tweak only needs to land in one semantic place. """ if total <= 0: - return True # nothing to delete; gate is trivially satisfied + return True pct = delete_count * 100 // total return pct <= max_delete_pct diff --git a/scripts/ops/test_sweep_cf_decide.py b/scripts/ops/test_sweep_cf_decide.py index 240c88b3..930ba3c0 100644 --- a/scripts/ops/test_sweep_cf_decide.py +++ b/scripts/ops/test_sweep_cf_decide.py @@ -17,10 +17,8 @@ import unittest import sweep_cf_decide as M -# --- Fixtures --------------------------------------------------------------- - -PROD = {"acme", "globex", "initech"} -STAGING = {"e2e-test-runner", "soak", "playground"} +# Caller responsibility (per the new decide signature): compute the union once. +ALL_SLUGS = {"acme", "globex", "initech", "e2e-test-runner", "soak", "playground"} LIVE_EC2 = {"ws-d3605ef2-f7d", "ws-aaaaaaaa-bbb", "ws-cafef00d-dec"} @@ -29,10 +27,7 @@ def rec(name: str, rid: str = "rid-x", typ: str = "A") -> dict: def call(record: dict) -> tuple: - return M.decide(record, PROD, STAGING, LIVE_EC2) - - -# --- Rule 1: platform core -------------------------------------------------- + return M.decide(record, ALL_SLUGS, LIVE_EC2) class TestPlatformCore(unittest.TestCase): @@ -43,10 +38,10 @@ class TestPlatformCore(unittest.TestCase): self.assertEqual((action, reason), ("keep", "apex")) def test_underscore_records_kept(self): - # _vercel, _domainkey, _railway-verify, etc. 
for n in ("_vercel.moleculesai.app", "_railway-verify.moleculesai.app"): - action, reason, *_ = call(rec(n)) - self.assertEqual((action, reason), ("keep", "verification/key"), n) + with self.subTest(name=n): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "verification/key")) def test_dkim_kept(self): action, reason, *_ = call(rec("send._domainkey.moleculesai.app")) @@ -62,11 +57,9 @@ class TestPlatformCore(unittest.TestCase): "www.moleculesai.app", "staging-api.moleculesai.app", ): - action, reason, *_ = call(rec(n)) - self.assertEqual((action, reason), ("keep", "platform-core"), n) - - -# --- Rule 3: ws-- ----------------------------------------------- + with self.subTest(name=n): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "platform-core")) class TestWsRule(unittest.TestCase): @@ -89,9 +82,6 @@ class TestWsRule(unittest.TestCase): self.assertEqual((action, reason), ("delete", "orphan-ws")) -# --- Rule 4: e2e-* tenants -------------------------------------------------- - - class TestE2ERule(unittest.TestCase): def test_live_e2e_kept(self): action, reason, *_ = call(rec("e2e-test-runner.staging.moleculesai.app")) @@ -102,14 +92,10 @@ class TestE2ERule(unittest.TestCase): self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) def test_dead_e2e_on_prod_deleted(self): - # e2e-* on prod (no .staging) is also tenant-shaped — deletion path. 
action, reason, *_ = call(rec("e2e-ghost.moleculesai.app")) self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) -# --- Rule 2: generic tenant subdomain --------------------------------------- - - class TestTenantSubdomainRule(unittest.TestCase): def test_live_prod_tenant_kept(self): action, reason, *_ = call(rec("acme.moleculesai.app")) @@ -120,14 +106,10 @@ class TestTenantSubdomainRule(unittest.TestCase): self.assertEqual((action, reason), ("keep", "live-tenant")) def test_unknown_subdomain_kept_for_safety(self): - # The script intentionally KEEPS unknown patterns to avoid blast. action, reason, *_ = call(rec("hermes-final-2.moleculesai.app")) self.assertEqual((action, reason), ("keep", "unknown-subdomain-kept-for-safety")) -# --- Rule 5 / fallthrough --------------------------------------------------- - - class TestNotASweepPattern(unittest.TestCase): def test_external_domain_kept(self): # Domain-spoofing attempt — must NOT match any of the moleculesai.app rules. @@ -139,13 +121,10 @@ class TestNotASweepPattern(unittest.TestCase): self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep")) -# --- Rule priority ---------------------------------------------------------- - - class TestRulePriority(unittest.TestCase): - """Rule 1 (platform-core) wins over later rules even if the name shape - overlaps — e.g. ``api.moleculesai.app`` matches Rule 2's tenant pattern - but must be classified as platform-core.""" + """Platform-core check must precede the tenant-subdomain regex — + e.g. 
``api.moleculesai.app`` matches the tenant pattern but must + classify as platform-core.""" def test_api_subdomain_classified_as_platform_not_tenant(self): action, reason, *_ = call(rec("api.moleculesai.app")) @@ -156,9 +135,6 @@ class TestRulePriority(unittest.TestCase): self.assertEqual(reason, "verification/key") -# --- Safety gate ------------------------------------------------------------ - - class TestSafetyGate(unittest.TestCase): """The bash gate refuses to delete >MAX_DELETE_PCT (default 50%).""" @@ -179,48 +155,42 @@ class TestSafetyGate(unittest.TestCase): self.assertFalse(M.safety_gate(total=100, delete_count=76, max_delete_pct=75)) -# --- Empty live-sets behavior (incident-prevention) ------------------------- - - class TestEmptyLiveSets(unittest.TestCase): """If the CP admin API returns no orgs (auth broken, network blip), - every tenant-shaped record looks orphan. The decide function alone - has no defense — that's the safety_gate's job. This test pins the - expected behavior so the safety-gate contract is documented.""" + every tenant-shaped record looks orphan. decide() alone has no + defense — that's safety_gate's job. This test pins the contract so + a future "make decide() defensive" change doesn't silently bypass + the gate.""" def test_dead_e2e_orphans_when_live_set_empty(self): - empty = set() action, reason, *_ = M.decide( rec("e2e-test-runner.staging.moleculesai.app"), - empty, empty, set(), + set(), set(), ) - # decide() classifies as orphan — gate is the line of defense. self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) def test_live_ws_still_kept_when_ec2_set_empty(self): - # Symmetric: ws-* without matching EC2 = orphan. 
action, reason, *_ = M.decide( rec("ws-cafef00d-dec.moleculesai.app"), - PROD, STAGING, set(), + ALL_SLUGS, set(), ) self.assertEqual((action, reason), ("delete", "orphan-ws")) -# --- Parity check ----------------------------------------------------------- - - class TestParityWithBashScript(unittest.TestCase): """The decision logic exists in two places: the canonical block in sweep_cf_decide.py and the inline heredoc in sweep-cf-orphans.sh. - This test asserts the two byte-for-byte match between the + This test asserts the two match between the ``# CANONICAL DECIDE BEGIN`` / ``# CANONICAL DECIDE END`` markers, - so an edit to one without the other fails CI loudly.""" + so an edit to one without the other fails CI loudly. The mirror- + reminder comment lives OUTSIDE the markers in the .sh file so we + don't need to special-case it here.""" @staticmethod - def _slice_canonical(text: str) -> str: - """Return the canonical block, line-anchored (mentions of the marker - words inside docstrings are ignored — only an exact-match line - ``# CANONICAL DECIDE BEGIN`` opens the slice).""" + def _slice_canonical(text: str) -> list[str]: + """Return the lines between the canonical markers, exclusive. + Markers are matched line-anchored (a stripped-line literal match) + so the docstring's prose mention is ignored.""" lines = text.splitlines() begin_idx = end_idx = None for i, line in enumerate(lines): @@ -235,27 +205,14 @@ class TestParityWithBashScript(unittest.TestCase): "missing CANONICAL DECIDE BEGIN/END markers — " "first 30 lines were:\n" + "\n".join(lines[:30]) ) - block = lines[begin_idx + 1:end_idx] - # Strip leading whitespace per-line so the .sh heredoc (no indent) - # and the .py module (also no indent at function scope) compare equal - # even if a future move into a class adds indent on one side. 
- return "\n".join(line.strip() for line in block if line.strip()) + return lines[begin_idx + 1:end_idx] def test_blocks_match(self): here = os.path.dirname(__file__) with open(os.path.join(here, "sweep_cf_decide.py"), "r", encoding="utf-8") as f: py_block = self._slice_canonical(f.read()) with open(os.path.join(here, "sweep-cf-orphans.sh"), "r", encoding="utf-8") as f: - # Strip the bash-only marker comment line (the .py file doesn't - # carry the "Edits inside this block must mirror …" reminder). - sh_text = f.read().replace( - "# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the\n", - "", - ).replace( - "# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte.\n", - "", - ) - sh_block = self._slice_canonical(sh_text) + sh_block = self._slice_canonical(f.read()) self.assertEqual( py_block, sh_block,