refactor(ops): apply simplify findings on #2027 PR

Code-quality + efficiency review of PR #2079:

- Hoist all_slugs = prod_slugs | staging_slugs out of decide() into the
  caller (was rebuilt on every record — 1k records × ~50-slug union per
  call). decide() signature now (r, all_slugs, ec2_names).
- Compile regexes at module scope (_WS_RE, _E2E_RE, _TENANT_RE) +
  hoist platform-core literal set (_PLATFORM_CORE_NAMES). Same change
  mirrored in the bash heredoc.
- Drop decorative # Rule N: comments (numbering was out of order, 3
  before 2 — actively confusing).
- Move the "edits must mirror" reminder OUTSIDE the CANONICAL DECIDE
  block in the .sh file, eliminating the .replace() comment-skip hack
  in TestParityWithBashScript.
- Drop per-line .strip() in _slice_canonical (would mask a real
  indentation bug; both blocks already at column 0).
- subTest() in TestPlatformCore loops so a single failure no longer
  short-circuits the rest of the items.
- merge_group + concurrency on test-ops-scripts.yml (parity with
  ci.yml gate behaviour).
- Fix don't apostrophe in inline comment that closed the python
  heredoc's single-quote and broke bash -n.

All 25 tests still pass. bash -n clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
rabbitblood 2026-04-26 00:28:15 -07:00
parent ba78a5c00d
commit 6494e9192b
4 changed files with 86 additions and 116 deletions

View File

@ -15,6 +15,12 @@ on:
paths:
- 'scripts/ops/**'
- '.github/workflows/test-ops-scripts.yml'
merge_group:
types: [checks_requested]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test:

View File

@ -117,66 +117,69 @@ log " CF records: $TOTAL_CF"
# 5. Anything else → keep (we only sweep patterns we understand).
export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF
# Edits inside the CANONICAL DECIDE block below must mirror
# scripts/ops/sweep_cf_decide.py — the parity test in
# test_sweep_cf_decide.py asserts they match byte-for-byte.
DECISIONS=$(echo "$CF_JSON" | python3 -c '
import json, os, re, sys
d = json.load(sys.stdin)
prod_slugs = set(os.environ["PROD_SLUGS"].split())
staging_slugs = set(os.environ["STAGING_SLUGS"].split())
all_slugs = prod_slugs | staging_slugs
ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n)
_PLATFORM_CORE_NAMES = {
"api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app",
"send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app",
"staging-api.moleculesai.app",
}
_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$")
_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$")
_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$")
# CANONICAL DECIDE BEGIN
# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the
# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte.
def decide(r, prod_slugs, staging_slugs, ec2_names):
def decide(r, all_slugs, ec2_names):
n = r["name"]
rid = r["id"]
typ = r["type"]
all_slugs = prod_slugs | staging_slugs
# Rule 1: platform core — leave alone
if n == "moleculesai.app":
return ("keep", "apex", rid, n, typ)
if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"):
return ("keep", "verification/key", rid, n, typ)
if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app",
"send.moleculesai.app","status.moleculesai.app","www.moleculesai.app",
"staging-api.moleculesai.app"}:
if n in _PLATFORM_CORE_NAMES:
return ("keep", "platform-core", rid, n, typ)
# Rule 3: ws-<hex8>-<rest>.(staging.)moleculesai.app
m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n)
m = _WS_RE.match(n)
if m:
prefix = m.group(1)
# Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain.
# Live EC2 names share the ws-<hex8>-<rest> shape with the DNS subdomain.
for ename in ec2_names:
if ename.startswith(prefix):
return ("keep", "live-ec2", rid, n, typ)
return ("delete", "orphan-ws", rid, n, typ)
# Rule 4: e2e-* tenants (includes canary, canvas variants)
m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n)
m = _E2E_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-e2e-tenant", rid, n, typ)
return ("delete", "orphan-e2e-tenant", rid, n, typ)
# Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app)
m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n)
m = _TENANT_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-tenant", rid, n, typ)
# Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*")
# To avoid false-positive nukes on ad-hoc records, we KEEP anything that
# does not match a known pattern. Orphan only for explicit tenant-shaped names.
# KEEP unknown tenant-shaped names — avoid false-positive nukes on
# ad-hoc records (e.g. hermes-final-*) that do not match a known slug.
return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ)
return ("keep", "not-a-pattern-we-sweep", rid, n, typ)
# CANONICAL DECIDE END
for r in d["result"]:
action, reason, rid, name, typ = decide(r, prod_slugs, staging_slugs, ec2_names)
action, reason, rid, name, typ = decide(r, all_slugs, ec2_names)
print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ}))
')

View File

@ -11,10 +11,11 @@ If you change the rules: edit BOTH this file AND the inline block in
``# CANONICAL DECIDE BEGIN`` to ``# CANONICAL DECIDE END`` markers in
both files; the parity check compares those slices).
Inputs to ``decide(record, prod_slugs, staging_slugs, ec2_names)``:
Inputs to ``decide(record, all_slugs, ec2_names)``:
record Cloudflare DNS record dict {name, id, type}
prod_slugs set of CP-prod org slugs (live tenants)
staging_slugs set of CP-staging org slugs
all_slugs set of CP org slugs (prod staging) caller computes the
union once instead of per-record (decide is hot-path: 100s
to 1000s of records per sweep)
ec2_names set of live EC2 Name tags (e.g. ``ws-d3605ef2-f7d``)
Returns ``(action, reason, id, name, type)`` matching the bash heredoc.
@ -22,53 +23,57 @@ Returns ``(action, reason, id, name, type)`` matching the bash heredoc.
from __future__ import annotations
import re
from typing import Iterable
# Pre-compile per-record regexes once at module load — saves the per-call
# pattern-cache lookup across 1000s of CF records per sweep. Mirrored at
# the same scope in sweep-cf-orphans.sh's heredoc.
_PLATFORM_CORE_NAMES = {
"api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app",
"send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app",
"staging-api.moleculesai.app",
}
_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$")
_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$")
_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$")
# CANONICAL DECIDE BEGIN
def decide(r, prod_slugs, staging_slugs, ec2_names):
def decide(r, all_slugs, ec2_names):
n = r["name"]
rid = r["id"]
typ = r["type"]
all_slugs = prod_slugs | staging_slugs
# Rule 1: platform core — leave alone
if n == "moleculesai.app":
return ("keep", "apex", rid, n, typ)
if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"):
return ("keep", "verification/key", rid, n, typ)
if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app",
"send.moleculesai.app","status.moleculesai.app","www.moleculesai.app",
"staging-api.moleculesai.app"}:
if n in _PLATFORM_CORE_NAMES:
return ("keep", "platform-core", rid, n, typ)
# Rule 3: ws-<hex8>-<rest>.(staging.)moleculesai.app
m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n)
m = _WS_RE.match(n)
if m:
prefix = m.group(1)
# Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain.
# Live EC2 names share the ws-<hex8>-<rest> shape with the DNS subdomain.
for ename in ec2_names:
if ename.startswith(prefix):
return ("keep", "live-ec2", rid, n, typ)
return ("delete", "orphan-ws", rid, n, typ)
# Rule 4: e2e-* tenants (includes canary, canvas variants)
m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n)
m = _E2E_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-e2e-tenant", rid, n, typ)
return ("delete", "orphan-e2e-tenant", rid, n, typ)
# Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app)
m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n)
m = _TENANT_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-tenant", rid, n, typ)
# Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*")
# To avoid false-positive nukes on ad-hoc records, we KEEP anything that
# does not match a known pattern. Orphan only for explicit tenant-shaped names.
# KEEP unknown tenant-shaped names — avoid false-positive nukes on
# ad-hoc records (e.g. hermes-final-*) that do not match a known slug.
return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ)
return ("keep", "not-a-pattern-we-sweep", rid, n, typ)
@ -78,13 +83,12 @@ def decide(r, prod_slugs, staging_slugs, ec2_names):
def safety_gate(total: int, delete_count: int, max_delete_pct: int = 50) -> bool:
"""Return True iff the sweep is safe to execute.
Mirrors the shell-side gate in sweep-cf-orphans.sh: if the deletion
fraction exceeds ``max_delete_pct`` the sweep refuses to run. The
bash script computes the integer percentage as ``DELETE_COUNT*100/TOTAL``
keeping the same arithmetic here so a future "raise to 75%" tweak
needs to be made in only one place semantically.
Mirrors the shell-side gate: if the deletion fraction exceeds
``max_delete_pct`` the sweep refuses to run. Same integer arithmetic
as the bash script (``DELETE_COUNT*100/TOTAL``) so a future threshold
tweak only needs to land in one semantic place.
"""
if total <= 0:
return True # nothing to delete; gate is trivially satisfied
return True
pct = delete_count * 100 // total
return pct <= max_delete_pct

View File

@ -17,10 +17,8 @@ import unittest
import sweep_cf_decide as M
# --- Fixtures ---------------------------------------------------------------
PROD = {"acme", "globex", "initech"}
STAGING = {"e2e-test-runner", "soak", "playground"}
# Caller responsibility (per the new decide signature): compute the union once.
ALL_SLUGS = {"acme", "globex", "initech", "e2e-test-runner", "soak", "playground"}
LIVE_EC2 = {"ws-d3605ef2-f7d", "ws-aaaaaaaa-bbb", "ws-cafef00d-dec"}
@ -29,10 +27,7 @@ def rec(name: str, rid: str = "rid-x", typ: str = "A") -> dict:
def call(record: dict) -> tuple:
return M.decide(record, PROD, STAGING, LIVE_EC2)
# --- Rule 1: platform core --------------------------------------------------
return M.decide(record, ALL_SLUGS, LIVE_EC2)
class TestPlatformCore(unittest.TestCase):
@ -43,10 +38,10 @@ class TestPlatformCore(unittest.TestCase):
self.assertEqual((action, reason), ("keep", "apex"))
def test_underscore_records_kept(self):
# _vercel, _domainkey, _railway-verify, etc.
for n in ("_vercel.moleculesai.app", "_railway-verify.moleculesai.app"):
action, reason, *_ = call(rec(n))
self.assertEqual((action, reason), ("keep", "verification/key"), n)
with self.subTest(name=n):
action, reason, *_ = call(rec(n))
self.assertEqual((action, reason), ("keep", "verification/key"))
def test_dkim_kept(self):
action, reason, *_ = call(rec("send._domainkey.moleculesai.app"))
@ -62,11 +57,9 @@ class TestPlatformCore(unittest.TestCase):
"www.moleculesai.app",
"staging-api.moleculesai.app",
):
action, reason, *_ = call(rec(n))
self.assertEqual((action, reason), ("keep", "platform-core"), n)
# --- Rule 3: ws-<hex8>-<rest> -----------------------------------------------
with self.subTest(name=n):
action, reason, *_ = call(rec(n))
self.assertEqual((action, reason), ("keep", "platform-core"))
class TestWsRule(unittest.TestCase):
@ -89,9 +82,6 @@ class TestWsRule(unittest.TestCase):
self.assertEqual((action, reason), ("delete", "orphan-ws"))
# --- Rule 4: e2e-* tenants --------------------------------------------------
class TestE2ERule(unittest.TestCase):
def test_live_e2e_kept(self):
action, reason, *_ = call(rec("e2e-test-runner.staging.moleculesai.app"))
@ -102,14 +92,10 @@ class TestE2ERule(unittest.TestCase):
self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant"))
def test_dead_e2e_on_prod_deleted(self):
# e2e-* on prod (no .staging) is also tenant-shaped — deletion path.
action, reason, *_ = call(rec("e2e-ghost.moleculesai.app"))
self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant"))
# --- Rule 2: generic tenant subdomain ---------------------------------------
class TestTenantSubdomainRule(unittest.TestCase):
def test_live_prod_tenant_kept(self):
action, reason, *_ = call(rec("acme.moleculesai.app"))
@ -120,14 +106,10 @@ class TestTenantSubdomainRule(unittest.TestCase):
self.assertEqual((action, reason), ("keep", "live-tenant"))
def test_unknown_subdomain_kept_for_safety(self):
# The script intentionally KEEPS unknown patterns to avoid blast.
action, reason, *_ = call(rec("hermes-final-2.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "unknown-subdomain-kept-for-safety"))
# --- Rule 5 / fallthrough ---------------------------------------------------
class TestNotASweepPattern(unittest.TestCase):
def test_external_domain_kept(self):
# Domain-spoofing attempt — must NOT match any of the moleculesai.app rules.
@ -139,13 +121,10 @@ class TestNotASweepPattern(unittest.TestCase):
self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep"))
# --- Rule priority ----------------------------------------------------------
class TestRulePriority(unittest.TestCase):
"""Rule 1 (platform-core) wins over later rules even if the name shape
overlaps e.g. ``api.moleculesai.app`` matches Rule 2's tenant pattern
but must be classified as platform-core."""
"""Platform-core check must precede the tenant-subdomain regex —
e.g. ``api.moleculesai.app`` matches the tenant pattern but must
classify as platform-core."""
def test_api_subdomain_classified_as_platform_not_tenant(self):
action, reason, *_ = call(rec("api.moleculesai.app"))
@ -156,9 +135,6 @@ class TestRulePriority(unittest.TestCase):
self.assertEqual(reason, "verification/key")
# --- Safety gate ------------------------------------------------------------
class TestSafetyGate(unittest.TestCase):
"""The bash gate refuses to delete >MAX_DELETE_PCT (default 50%)."""
@ -179,48 +155,42 @@ class TestSafetyGate(unittest.TestCase):
self.assertFalse(M.safety_gate(total=100, delete_count=76, max_delete_pct=75))
# --- Empty live-sets behavior (incident-prevention) -------------------------
class TestEmptyLiveSets(unittest.TestCase):
"""If the CP admin API returns no orgs (auth broken, network blip),
every tenant-shaped record looks orphan. The decide function alone
has no defense that's the safety_gate's job. This test pins the
expected behavior so the safety-gate contract is documented."""
every tenant-shaped record looks orphan. decide() alone has no
defense that's safety_gate's job. This test pins the contract so
a future "make decide() defensive" change doesn't silently bypass
the gate."""
def test_dead_e2e_orphans_when_live_set_empty(self):
empty = set()
action, reason, *_ = M.decide(
rec("e2e-test-runner.staging.moleculesai.app"),
empty, empty, set(),
set(), set(),
)
# decide() classifies as orphan — gate is the line of defense.
self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant"))
def test_live_ws_still_kept_when_ec2_set_empty(self):
# Symmetric: ws-* without matching EC2 = orphan.
action, reason, *_ = M.decide(
rec("ws-cafef00d-dec.moleculesai.app"),
PROD, STAGING, set(),
ALL_SLUGS, set(),
)
self.assertEqual((action, reason), ("delete", "orphan-ws"))
# --- Parity check -----------------------------------------------------------
class TestParityWithBashScript(unittest.TestCase):
"""The decision logic exists in two places: the canonical block in
sweep_cf_decide.py and the inline heredoc in sweep-cf-orphans.sh.
This test asserts the two byte-for-byte match between the
This test asserts the two match between the
``# CANONICAL DECIDE BEGIN`` / ``# CANONICAL DECIDE END`` markers,
so an edit to one without the other fails CI loudly."""
so an edit to one without the other fails CI loudly. The mirror-
reminder comment lives OUTSIDE the markers in the .sh file so we
don't need to special-case it here."""
@staticmethod
def _slice_canonical(text: str) -> str:
"""Return the canonical block, line-anchored (mentions of the marker
words inside docstrings are ignored only an exact-match line
``# CANONICAL DECIDE BEGIN`` opens the slice)."""
def _slice_canonical(text: str) -> list[str]:
"""Return the lines between the canonical markers, exclusive.
Markers are matched line-anchored (a stripped-line literal match)
so the docstring's prose mention is ignored."""
lines = text.splitlines()
begin_idx = end_idx = None
for i, line in enumerate(lines):
@ -235,27 +205,14 @@ class TestParityWithBashScript(unittest.TestCase):
"missing CANONICAL DECIDE BEGIN/END markers — "
"first 30 lines were:\n" + "\n".join(lines[:30])
)
block = lines[begin_idx + 1:end_idx]
# Strip leading whitespace per-line so the .sh heredoc (no indent)
# and the .py module (also no indent at function scope) compare equal
# even if a future move into a class adds indent on one side.
return "\n".join(line.strip() for line in block if line.strip())
return lines[begin_idx + 1:end_idx]
def test_blocks_match(self):
here = os.path.dirname(__file__)
with open(os.path.join(here, "sweep_cf_decide.py"), "r", encoding="utf-8") as f:
py_block = self._slice_canonical(f.read())
with open(os.path.join(here, "sweep-cf-orphans.sh"), "r", encoding="utf-8") as f:
# Strip the bash-only marker comment line (the .py file doesn't
# carry the "Edits inside this block must mirror …" reminder).
sh_text = f.read().replace(
"# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the\n",
"",
).replace(
"# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte.\n",
"",
)
sh_block = self._slice_canonical(sh_text)
sh_block = self._slice_canonical(f.read())
self.assertEqual(
py_block,
sh_block,