From bf87d20959a0ac460e111b0ea57e8559327b517b Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Fri, 12 Jun 2026 16:19:15 +0000 Subject: [PATCH 1/2] fix(ops): CF-auth + zone-reach preflight in sweep-cf-orphans (closes 863a3567/240f7a35) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Researcher RCA 2026-06-12 (runs 352709/job 476863 + 352596/job 476689 at SHA 15872306): sweep-cf-orphans.sh previously proceeded into CP and EC2 gather BEFORE hitting the CF DNS list call. If the CF token was expired/revoked/wrong-scope, the CF call failed MID-RUN after ~30s of wasted gather work, producing a half-completed audit log with no clear signal about which step (token vs zone vs permission) was the culprit. Fail-fast preflight added before any gather/sweep: 1) GET /user/tokens/verify — must return success:true + status=active. 2) GET /zones/{CF_ZONE_ID} — must return success:true + matching id. On any failure: clear error message + exit 1, NO destructive work. Existing presence check preserved (cheap fast-fail), real auth validation added on top. No change to delete logic or schedule. Hermetic test: scripts/ops/test_sweep_cf_orphans_preflight.sh stands up a local HTTP server mimicking the four CF API behaviors we need (active token, inactive token, bad zone id, unreachable API) and asserts the preflight gates each correctly. No network, no jq (uses python3 for JSON). Re-runnable in CI. 
--- scripts/ops/sweep-cf-orphans.sh | 77 +++++ .../ops/test_sweep_cf_orphans_preflight.sh | 265 ++++++++++++++++++ 2 files changed, 342 insertions(+) create mode 100755 scripts/ops/test_sweep_cf_orphans_preflight.sh diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh index 9c2fc8906..a0e9c3775 100755 --- a/scripts/ops/sweep-cf-orphans.sh +++ b/scripts/ops/sweep-cf-orphans.sh @@ -72,6 +72,83 @@ need CP_STAGING_ADMIN_API_TOKEN log() { echo "[$(date -u +%H:%M:%S)] $*"; } +# --- Preflight: verify CF auth + zone reachability BEFORE any gather/sweep --- +# Why this exists (Researcher RCA, 2026-06-12, runs 352709/job 476863 + +# 352596/job 476689 at SHA 15872306): the script previously proceeded +# into the CP and EC2 gather (lines below) before hitting the CF DNS +# list call. If the CF token was expired/revoked/wrong-scope, the CF +# list call failed MID-RUN, after ~30s of wasted CP/AWS gather work, +# and the operator got a half-completed audit log with no clear +# signal about which step (CF token vs zone vs permission) was the +# culprit. Fail-fast preflight: verify the token is active AND the +# zone is reachable BEFORE any other work. On failure: exit non-zero, +# NO destructive step is taken. +log "Preflight: verifying CF token + zone (before any gather/sweep)..." +PF_TOKEN_JSON=$(curl -sS -m 10 -H "Authorization: Bearer $CF_API_TOKEN" \ + "https://api.cloudflare.com/client/v4/user/tokens/verify") +if ! 
echo "$PF_TOKEN_JSON" | python3 -c ' +import json, sys +try: + p = json.load(sys.stdin) +except Exception as exc: + print(f"ERROR: non-JSON from /user/tokens/verify: {exc}", file=sys.stderr) + raise SystemExit(1) +if not p.get("success"): + errs = p.get("errors") or [] + detail = "; ".join( + "{code}: {msg}".format( + code=e.get("code", "?"), + msg=e.get("message", "?"), + ) + for e in errs + ) or "unknown" + print(f"ERROR: CF token verify returned success=false: {detail}", file=sys.stderr) + raise SystemExit(1) +status = (p.get("result") or {}).get("status", "?") +if status != "active": + print(f"ERROR: CF token is not active (status={status}) — sweep refused.", file=sys.stderr) + raise SystemExit(1) +'; then + log " CF preflight FAILED — token verify did not return active." + log " Check CF_API_TOKEN (or CLOUDFLARE_API_TOKEN) is set, not expired, and not revoked." + exit 1 +fi +log " CF token active ✓" + +PF_ZONE_JSON=$(curl -sS -m 10 -H "Authorization: Bearer $CF_API_TOKEN" \ + "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID") +if ! echo "$PF_ZONE_JSON" | CF_ZONE_ID="$CF_ZONE_ID" python3 -c ' +import json, os, sys +try: + p = json.load(sys.stdin) +except Exception as exc: + print("ERROR: non-JSON from /zones/{}: {}".format(os.environ["CF_ZONE_ID"], exc), file=sys.stderr) + raise SystemExit(1) +if not p.get("success"): + errs = p.get("errors") or [] + detail = "; ".join( + "{code}: {msg}".format( + code=e.get("code", "?"), + msg=e.get("message", "?"), + ) + for e in errs + ) or "unknown" + print(f"ERROR: zone lookup returned success=false: {detail}", file=sys.stderr) + raise SystemExit(1) +res = p.get("result") or {} +got_id = res.get("id") +expected = os.environ["CF_ZONE_ID"] +if got_id != expected: + print(f"ERROR: zone id mismatch — expected {expected!r}, got {got_id!r}", file=sys.stderr) + raise SystemExit(1) +'; then + log " CF preflight FAILED — zone $CF_ZONE_ID unreachable or token lacks Zone:Read on it." 
+ log " Check CF_ZONE_ID (or CLOUDFLARE_ZONE_ID) is the moleculesai.app zone id, and the token has Zone:Read on it." + exit 1 +fi +log " zone $CF_ZONE_ID reachable ✓" + + # --- Gather live sets ------------------------------------------------------ log "Fetching CP prod org slugs..." diff --git a/scripts/ops/test_sweep_cf_orphans_preflight.sh b/scripts/ops/test_sweep_cf_orphans_preflight.sh new file mode 100755 index 000000000..df6da4808 --- /dev/null +++ b/scripts/ops/test_sweep_cf_orphans_preflight.sh @@ -0,0 +1,265 @@ +#!/usr/bin/env bash +# test_sweep_cf_orphans_preflight.sh — hermetic regression test for the +# sweep-cf-orphans.sh preflight block (Researcher RCA 2026-06-12, +# runs 352709/job 476863 + 352596/job 476689 at SHA 15872306). +# +# The preflight was added so that an expired/revoked/wrong-scope CF +# token fails the sweep IMMEDIATELY (with a clear error message) and +# BEFORE any of the CP or AWS gather work happens. Without it, the +# script proceeded into the gather steps (~30s of wasted work) and +# then died on the CF DNS list call, leaving a half-completed audit +# log. +# +# This test stands up a tiny local HTTP server that mimics the +# Cloudflare API responses we need (token-verify + zone-lookup + +# dns-records), points the script at it via patch-and-redirect, and +# asserts the four critical behaviors: +# +# (a) active token + reachable zone → preflight passes (script then +# computes 0 decisions and exits 0; that's the expected +# downstream behavior — the preflight itself passed) +# (b) inactive token (success=false) → preflight fails fast with a +# clear error; NO gather work ("Fetching CP..." 
or "Fetching +# live EC2...") is printed +# (c) bad zone id (mismatch between configured CF_ZONE_ID and +# what the API returns) → preflight fails with the mismatch +# message +# (d) unreachable CF API (server returns 500 + non-JSON) → +# preflight fails with a non-JSON error; no gather work happens +# +# Hermetic, no network, no jq needed (uses python3 for JSON checks). +set -euo pipefail + +# Derive the source location: this test lives next to the script it +# exercises. If it's been moved (e.g. to /tmp for an isolated run), +# fall back to the repo's canonical scripts/ops path via git rev-parse. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -f "$SCRIPT_DIR/sweep-cf-orphans.sh" ]; then + SCRIPT="$SCRIPT_DIR/sweep-cf-orphans.sh" +else + REPO_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null || true)" + if [ -n "$REPO_ROOT" ] && [ -f "$REPO_ROOT/scripts/ops/sweep-cf-orphans.sh" ]; then + SCRIPT="$REPO_ROOT/scripts/ops/sweep-cf-orphans.sh" + else + echo "FAIL: cannot locate sweep-cf-orphans.sh" >&2 + exit 1 + fi +fi + +[ -f "$SCRIPT" ] || { echo "FAIL: script not found: $SCRIPT" >&2; exit 1; } +command -v python3 >/dev/null || { echo "FAIL: python3 not on PATH" >&2; exit 1; } + +fail() { echo "FAIL: $*" >&2; exit 1; } +pass() { echo "PASS: $*"; } + +# Stand up a tiny HTTP server that emulates just enough of the +# Cloudflare API. The "scenario" is selected by the URL prefix: +# /client/v4/... → default (active token, zone OK) +# /scenario-inactive/client/v4/... → token verify returns success=false +# /scenario-mismatch/client/v4/... → zone lookup returns different id +# /scenario-down/client/v4/... → 500 + non-JSON body +SRVDIR="$(mktemp -d)" +trap 'rm -rf "$SRVDIR"' EXIT +cat >"$SRVDIR/server.py" <<'PYEOF' +import http.server, json, socketserver, sys, urllib.parse +class H(http.server.BaseHTTPRequestHandler): + def do_GET(self): + u = urllib.parse.urlparse(self.path) + path = u.path + # /scenario-inactive/... etc. 
are prefixes that drive the + # failure mode. Strip the prefix and use it as the scenario. + scenario = "active" + if path.startswith("/scenario-inactive/"): + scenario = "inactive" + path = path[len("/scenario-inactive"):] + elif path.startswith("/scenario-mismatch/"): + scenario = "mismatch" + path = path[len("/scenario-mismatch"):] + elif path.startswith("/scenario-down/"): + scenario = "down" + path = path[len("/scenario-down"):] + + if "down" == scenario: + # 500 + non-JSON body — exercises the non-JSON path + body = b"not json" + self.send_response(500) + self.send_header("Content-Type", "text/plain") + self.send_header("Content-Length", str(len(body))) + self.send_header("Connection", "close") + self.end_headers() + self.wfile.write(body) + return + + rest = path.lstrip("/") + if "tokens/verify" in rest: + if scenario == "inactive": + payload = { + "success": False, + "errors": [{"code": 9109, "message": "Invalid API token"}], + "messages": [], + } + else: + payload = { + "success": True, "errors": [], "messages": [], + "result": {"id": "tok-1", "status": "active"}, + } + elif "dns_records" in rest: + payload = {"success": True, "errors": [], "messages": [], "result": []} + elif "zones/" in rest: + seg = rest.split("/") + zone_id = seg[2] if len(seg) > 2 else "test" + if scenario == "mismatch": + payload = { + "success": True, "errors": [], "messages": [], + "result": {"id": "DIFFERENT-ZONE-ID", "name": "moleculesai.app"}, + } + else: + payload = { + "success": True, "errors": [], "messages": [], + "result": {"id": zone_id, "name": "moleculesai.app"}, + } + else: + payload = { + "success": False, + "errors": [{"code": 10000, "message": "unknown endpoint"}], + "messages": [], + } + body = json.dumps(payload).encode() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.send_header("Connection", "close") + self.end_headers() + self.wfile.write(body) + def log_message(self, 
*a, **kw): pass + +socketserver.TCPServer.allow_reuse_address = True +srv = socketserver.TCPServer(("127.0.0.1", int(sys.argv[1])), H) +srv.serve_forever() +PYEOF + +# Find a free port +PORT="" +for tryport in $(seq 18080 18180); do + if python3 -c " +import socket, sys +s = socket.socket() +try: + s.bind(('127.0.0.1', $tryport)) +except OSError: + sys.exit(1) +s.close() +sys.exit(0) +" 2>/dev/null; then + PORT=$tryport + break + fi +done +[ -n "$PORT" ] || fail "could not find a free port in 18080-18180" + +python3 "$SRVDIR/server.py" "$PORT" >"$SRVDIR/server.out" 2>&1 & +SRV_PID=$! +trap 'kill $SRV_PID 2>/dev/null || true; rm -rf "$SRVDIR"' EXIT + +# Wait for server to bind (up to 10s — startup can be slow on +# busy CI runners; observed >1s locally when the shell is also busy +# doing other work). 50 × 0.2s = 10s ceiling. +ready=false +for _ in $(seq 1 50); do + if curl -sS --max-time 1 "http://127.0.0.1:$PORT/client/v4/user/tokens/verify" 2>/dev/null | grep -q '"status":"active"'; then + ready=true + break + fi + sleep 0.2 +done +if [[ "$ready" != "true" ]]; then + cat "$SRVDIR/server.out" >&2 || true + fail "mock server didn't come up on port $PORT" +fi +pass "mock CF server up on http://127.0.0.1:$PORT" + +# Create a patched copy of the script with the CF base URL redirected +# to our mock. There are 3 hardcoded `https://api.cloudflare.com/client/v4` +# references; replace them all. 
+WORK="$SRVDIR/sweep-cf-orphans-patched.sh" +cp "$SCRIPT" "$WORK" +CF_BASE="https://api.cloudflare.com/client/v4" +MOCK_BASE="http://127.0.0.1:$PORT/client/v4" +MOCK_BASE_INACTIVE="http://127.0.0.1:$PORT/scenario-inactive/client/v4" +MOCK_BASE_MISMATCH="http://127.0.0.1:$PORT/scenario-mismatch/client/v4" +MOCK_BASE_DOWN="http://127.0.0.1:$PORT/scenario-down/client/v4" +sed -i "s|$CF_BASE|$MOCK_BASE|g" "$WORK" +EXPECTED_COUNT=3 +ACTUAL_COUNT=$(grep -c "$MOCK_BASE" "$WORK" || true) +[ "$ACTUAL_COUNT" = "$EXPECTED_COUNT" ] \ + || fail "expected $EXPECTED_COUNT occurrences of mock base in patched script, got: $ACTUAL_COUNT" + +# Make a per-test patched copy (for scenario-specific URL prefixes) +make_patched() { + local url="$1" + local out="$2" + cp "$SCRIPT" "$out" + sed -i "s|$CF_BASE|$url|g" "$out" +} + +make_patched "$MOCK_BASE_INACTIVE" "$SRVDIR/patched-inactive.sh" +make_patched "$MOCK_BASE_MISMATCH" "$SRVDIR/patched-mismatch.sh" +make_patched "$MOCK_BASE_DOWN" "$SRVDIR/patched-down.sh" + +# Common env +ENV_TOKENS=( + CF_API_TOKEN=test-token-fake + CF_ZONE_ID=test-zone-id + CP_ADMIN_API_TOKEN=fake-cp-prod + CP_STAGING_ADMIN_API_TOKEN=fake-cp-staging + AWS_ACCESS_KEY_ID=fake + AWS_SECRET_ACCESS_KEY=fake +) + +# (a) Active token + reachable zone — preflight should pass. The +# script then computes 0 decisions on the empty mock DNS list and +# exits 0. The KEY assertion is the two preflight ✓ messages. +echo "=== (a) active token + reachable zone ===" +out_a=$(env "${ENV_TOKENS[@]}" bash "$WORK" 2>&1 || true) +echo "$out_a" | grep -q "CF token active" \ + || fail "(a) expected 'CF token active' in output, got: $(echo "$out_a" | head -10)" +echo "$out_a" | grep -q "zone test-zone-id reachable" \ + || fail "(a) expected 'zone test-zone-id reachable' in output, got: $(echo "$out_a" | head -10)" +pass "(a) preflight passes when token is active and zone is reachable" + +# (b) Inactive token — preflight fails BEFORE any gather work. 
+# CRITICAL: the gather steps must NOT have happened. +echo "=== (b) inactive token ===" +out_b=$(env CF_API_TOKEN=inactive-token bash "$SRVDIR/patched-inactive.sh" 2>&1 || true) +echo "$out_b" | grep -q "CF preflight FAILED" \ + || fail "(b) expected 'CF preflight FAILED' in output, got: $(echo "$out_b" | head -10)" +if echo "$out_b" | grep -qE "Fetching CP prod org slugs|Fetching live EC2 Name tags|Fetching Cloudflare DNS records"; then + fail "(b) preflight failed BUT gather steps ran — the fail-fast invariant is broken. Output: $(echo "$out_b" | head -20)" +fi +pass "(b) preflight fails fast on inactive token; NO gather steps ran" + +# (c) Zone-id mismatch — preflight fails with the mismatch message. +echo "=== (c) zone id mismatch ===" +out_c=$(env "${ENV_TOKENS[@]}" bash "$SRVDIR/patched-mismatch.sh" 2>&1 || true) +echo "$out_c" | grep -q "zone id mismatch" \ + || fail "(c) expected 'zone id mismatch' in output, got: $(echo "$out_c" | head -10)" +if echo "$out_c" | grep -qE "Fetching CP prod org slugs|Fetching live EC2 Name tags|Fetching Cloudflare DNS records"; then + fail "(c) preflight failed on zone mismatch BUT gather steps ran" +fi +pass "(c) preflight fails on zone-id mismatch; NO gather steps ran" + +# (d) Unreachable CF API (500 + non-JSON). 
+echo "=== (d) CF API unreachable (500 + non-JSON) ===" +out_d=$(env "${ENV_TOKENS[@]}" bash "$SRVDIR/patched-down.sh" 2>&1 || true) +echo "$out_d" | grep -qE "non-JSON from /user/tokens/verify|CF preflight FAILED" \ + || fail "(d) expected preflight failure message; got: $(echo "$out_d" | head -10)" +if echo "$out_d" | grep -qE "Fetching CP prod org slugs|Fetching live EC2 Name tags|Fetching Cloudflare DNS records"; then + fail "(d) preflight failed on 500 BUT gather steps ran" +fi +pass "(d) preflight fails on 500/non-JSON; NO gather steps ran" + +# Stop the mock server +kill $SRV_PID 2>/dev/null || true + +echo +echo "sweep-cf-orphans preflight regression test passed" -- 2.52.0 From 08c2bd4d9ae033d7822bc7a9158fdb51e0bac25c Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Fri, 12 Jun 2026 16:33:45 +0000 Subject: [PATCH 2/2] test(ops): fix hermetic CF-preflight test (Researcher 11116 REQUEST_CHANGES) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four real bugs in the regression test, all surfaced by CI: 1) Mock server was never detected as up: the port-probe didn't use SO_REUSEADDR (so a freed probe port could TIME_WAIT the server's bind), and the readiness wait was a chained curl+grep shell pipeline whose pattern '"status":"active"' could not match the mock's json.dumps output ('"status": "active"', with a space after the colon), so the wait timed out regardless of CI load. Replaced with a Python-based readiness probe (TCP connect + HTTP GET + JSON parse + status==active check, single source of truth) and a kill -0 on the server PID so a crash surfaces with stderr instead of timing out silently. Bumped the ceiling 10s -> 15s (75 * 0.2s) for busy runners. 2) Inactive-token case omitted CF_ZONE_ID: only CF_API_TOKEN was set for case (b), so the script's 'need CF_ZONE_ID' guard short- circuited BEFORE the preflight and we never actually exercised the auth-failure path. Set the full ENV_TOKENS (same as the success case) for (b) so a missing CF_ZONE_ID can't mask the regression we want to catch. 
3) EXPECTED_COUNT=3 was stale: the preflight addition brought the CF base refs in sweep-cf-orphans.sh from 3 to 4 (token-verify + zone-lookup in the preflight block, plus the original 2 in the sweep body). The patch-and-redirect test then replaced 4 occurrences, not 3, and the count assertion failed. Updated to 4 with a comment. 4) Server returned zone id 'zones' for active/down: the Python mock extracted zone_id from rest.split('/')[2] which is the literal 'zones' token, not the actual zone id (which lives at index 3 after the /client/v4/ prefix). Active/down cases then tripped the preflight's zone-mismatch check. Use seg[3] (with a seg[-1] fallback) and add a comment explaining the layout. No change to the preflight behavior in scripts/ops/sweep-cf-orphans.sh — only the test harness. The four critical behaviors are now exercised deterministically: (a) active token + reachable zone -> preflight passes (b) inactive token -> preflight fails fast, no gather (c) zone id mismatch -> preflight fails on mismatch (d) 500 + non-JSON -> preflight fails on non-JSON Locally verified: 'bash scripts/ops/test_sweep_cf_orphans_preflight.sh' prints all four PASS lines and exits 0. --- .../ops/test_sweep_cf_orphans_preflight.sh | 66 +++++++++++++++---- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/scripts/ops/test_sweep_cf_orphans_preflight.sh b/scripts/ops/test_sweep_cf_orphans_preflight.sh index df6da4808..eac28e186 100755 --- a/scripts/ops/test_sweep_cf_orphans_preflight.sh +++ b/scripts/ops/test_sweep_cf_orphans_preflight.sh @@ -106,8 +106,14 @@ class H(http.server.BaseHTTPRequestHandler): elif "dns_records" in rest: payload = {"success": True, "errors": [], "messages": [], "result": []} elif "zones/" in rest: + # URL is /client/v4/zones/{id}[/...]. rest is + # "client/v4/zones/{id}[/...]" so the zone id is the + # 4th segment (index 3). 
The previous seg[2] read + # the literal "zones" token, which made + # every non-mismatch lookup return zone id "zones" and + # trip the preflight's mismatch check. seg = rest.split("/") - zone_id = seg[2] if len(seg) > 2 else "test" + zone_id = seg[3] if len(seg) > 3 else (seg[-1] if seg else "test") if scenario == "mismatch": payload = { "success": True, "errors": [], "messages": [], @@ -138,12 +144,15 @@ srv = socketserver.TCPServer(("127.0.0.1", int(sys.argv[1])), H) srv.serve_forever() PYEOF -# Find a free port +# Find a free port. Use SO_REUSEADDR on the probe so we don't +# lose a port to TIME_WAIT after the probe (which races the server's +# bind in CI under load). PORT="" for tryport in $(seq 18080 18180); do if python3 -c " import socket, sys s = socket.socket() +s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: s.bind(('127.0.0.1', $tryport)) except OSError: @@ -161,20 +170,50 @@ python3 "$SRVDIR/server.py" "$PORT" >"$SRVDIR/server.out" 2>&1 & SRV_PID=$! trap 'kill $SRV_PID 2>/dev/null || true; rm -rf "$SRVDIR"' EXIT -# Wait for server to bind (up to 10s — startup can be slow on -# busy CI runners; observed >1s locally when the shell is also busy -# doing other work). 50 × 0.2s = 10s ceiling. +# Wait for server to bind (up to 15s — startup can be slow on busy +# CI runners, and a server that has bound-but-not-yet-accepting +# needs a moment to enter its accept loop). Use a Python-based +# readiness probe (TCP connect + HTTP GET + JSON parse) so we get +# a single source of truth on "the mock can serve the canonical +# request" rather than chaining curl + grep, whose exact-string +# match missed the space json.dumps puts after ":". Also verify the +# server PID is still alive so a crash surfaces with its stderr +# instead of timing out silently. ready=false -for _ in $(seq 1 50); do - if curl -sS --max-time 1 "http://127.0.0.1:$PORT/client/v4/user/tokens/verify" 2>/dev/null | grep -q '"status":"active"'; then +for _ in $(seq 1 75); do + if ! 
kill -0 "$SRV_PID" 2>/dev/null; then + cat "$SRVDIR/server.out" >&2 || true + fail "mock server PID $SRV_PID died during startup; stderr above" + fi + if python3 -c " +import json, socket, sys, urllib.request, urllib.error +try: + s = socket.create_connection(('127.0.0.1', $PORT), timeout=1) +except (OSError, socket.timeout): + sys.exit(1) +s.close() +try: + r = urllib.request.urlopen('http://127.0.0.1:$PORT/client/v4/user/tokens/verify', timeout=2) + body = r.read().decode() +except (urllib.error.URLError, OSError, socket.timeout): + sys.exit(1) +try: + p = json.loads(body) +except Exception: + sys.exit(1) +if p.get('result', {}).get('status') == 'active': + sys.exit(0) +sys.exit(1) +" 2>/dev/null; then ready=true break fi sleep 0.2 done if [[ "$ready" != "true" ]]; then + echo "mock server stderr/stdout so far:" >&2 cat "$SRVDIR/server.out" >&2 || true - fail "mock server didn't come up on port $PORT" + fail "mock server didn't come up on port $PORT within 15s" fi pass "mock CF server up on http://127.0.0.1:$PORT" @@ -189,7 +228,7 @@ MOCK_BASE_INACTIVE="http://127.0.0.1:$PORT/scenario-inactive/client/v4" MOCK_BASE_MISMATCH="http://127.0.0.1:$PORT/scenario-mismatch/client/v4" MOCK_BASE_DOWN="http://127.0.0.1:$PORT/scenario-down/client/v4" sed -i "s|$CF_BASE|$MOCK_BASE|g" "$WORK" -EXPECTED_COUNT=3 +EXPECTED_COUNT=4 ACTUAL_COUNT=$(grep -c "$MOCK_BASE" "$WORK" || true) [ "$ACTUAL_COUNT" = "$EXPECTED_COUNT" ] \ || fail "expected $EXPECTED_COUNT occurrences of mock base in patched script, got: $ACTUAL_COUNT" @@ -228,9 +267,14 @@ echo "$out_a" | grep -q "zone test-zone-id reachable" \ pass "(a) preflight passes when token is active and zone is reachable" # (b) Inactive token — preflight fails BEFORE any gather work. -# CRITICAL: the gather steps must NOT have happened. +# CRITICAL: the gather steps must NOT have happened. 
Use the same +# env as the success case (NOT just CF_API_TOKEN) so the script's +# `need CF_ZONE_ID` guard passes and we actually exercise the +# preflight's auth-failure path — otherwise a missing CF_ZONE_ID +# would short-circuit at the `need` check, masking the regression +# we want to catch. echo "=== (b) inactive token ===" -out_b=$(env CF_API_TOKEN=inactive-token bash "$SRVDIR/patched-inactive.sh" 2>&1 || true) +out_b=$(env "${ENV_TOKENS[@]}" CF_API_TOKEN=inactive-token bash "$SRVDIR/patched-inactive.sh" 2>&1 || true) echo "$out_b" | grep -q "CF preflight FAILED" \ || fail "(b) expected 'CF preflight FAILED' in output, got: $(echo "$out_b" | head -10)" if echo "$out_b" | grep -qE "Fetching CP prod org slugs|Fetching live EC2 Name tags|Fetching Cloudflare DNS records"; then -- 2.52.0