From 2a0807eb8e999b909a7e87a9fb1fea889595c355 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Mon, 22 Jun 2026 02:18:40 +0000 Subject: [PATCH 1/3] fix(sweep-cf-orphans): fail-closed live-org fetch + regression test Add curl -f and explicit JSON/orgs-array validation to the CP admin API fetches so any non-2xx, invalid JSON, or missing/invalid 'orgs' array aborts the sweep before Cloudflare DNS records are listed or classified as orphans. The existing MAX_DELETE_PCT gate remains as a second line of defense. Add tests/ops/test_sweep_cf_orphans_fail_closed.sh covering non-2xx, malformed JSON, missing 'orgs', and non-array 'orgs' responses. Co-Authored-By: Claude --- scripts/ops/sweep-cf-orphans.sh | 36 +++++- .../ops/test_sweep_cf_orphans_fail_closed.sh | 117 ++++++++++++++++++ 2 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 tests/ops/test_sweep_cf_orphans_fail_closed.sh diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh index a0e9c377..2f91d0a4 100755 --- a/scripts/ops/sweep-cf-orphans.sh +++ b/scripts/ops/sweep-cf-orphans.sh @@ -149,18 +149,42 @@ fi log " zone $CF_ZONE_ID reachable ✓" +# Fetch org slugs from a CP admin API endpoint. +# Fail-closed: any non-2xx HTTP response, invalid JSON, or missing/invalid +# 'orgs' array aborts the sweep with a non-zero exit. This prevents the +# safety gate from being the only defense when the CP source of truth is +# unreachable or returns an error body. 
+fetch_cp_orgs() {
+  local url="$1" token="$2" label="$3" resp
+  # stdout only: with 2>&1, anything curl writes to stderr would be parsed as JSON.
+  resp=$(curl -sS -f -m 15 -H "Authorization: Bearer $token" "$url") || {
+    echo "ERROR: $label CP admin API request failed (non-2xx or network error)" >&2
+    [ -z "$resp" ] || echo "$resp" >&2
+    return 1
+  }
+  # Single-quoted program with the label as argv[1]: no shell text enters the source.
+  python3 -c '
+import json, sys
+prefix = "ERROR: " + sys.argv[1] + " CP admin API "
+try:
+    d = json.loads(sys.stdin.read())
+except ValueError as e:  # JSONDecodeError, or input bytes that cannot be decoded
+    sys.exit(prefix + "returned invalid JSON: " + str(e))
+orgs = d.get("orgs") if isinstance(d, dict) else None  # top level may not be an object
+if not isinstance(orgs, list):
+    sys.exit(prefix + "response missing or invalid \"orgs\" array")
+print(" ".join(o["slug"] for o in orgs))
+' "$label" <<< "$resp"
+}
+
 # --- Gather live sets ------------------------------------------------------
 
 log "Fetching CP prod org slugs..."
-PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
-  "https://api.moleculesai.app/cp/admin/orgs?limit=500" \
-  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
+PROD_SLUGS=$(fetch_cp_orgs "https://api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_ADMIN_API_TOKEN" "prod") || exit 1
 log " prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')"
 
 log "Fetching CP staging org slugs..."
-STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
-  "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
-  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
+STAGING_SLUGS=$(fetch_cp_orgs "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_STAGING_ADMIN_API_TOKEN" "staging") || exit 1
 log " staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')"
 
 log "Fetching live EC2 Name tags (region=$REGION)..."
diff --git a/tests/ops/test_sweep_cf_orphans_fail_closed.sh b/tests/ops/test_sweep_cf_orphans_fail_closed.sh new file mode 100644 index 00000000..dcb463df --- /dev/null +++ b/tests/ops/test_sweep_cf_orphans_fail_closed.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# Regression test for scripts/ops/sweep-cf-orphans.sh — verifies the +# live-org fetch is fail-closed. A non-2xx response, invalid JSON, or a +# response missing the 'orgs' array must abort the sweep BEFORE any +# Cloudflare DNS records are listed or classified as orphans. +set -uo pipefail + +SCRIPT="${SCRIPT:-scripts/ops/sweep-cf-orphans.sh}" + +PASS=0 +FAIL=0 + +run_case() { + local name="$1" curl_exit="$2" curl_body="$3" + local expect_abort="${4:-true}" # true = must stop before CF/orphan classification + local tmp + tmp=$(mktemp -d -t cf-orphans-fail-closed-XXXXXX) + local sentinel="$tmp/cf_reached" + + export curl_body curl_exit tmp + python3 -c " +import os, shlex +body = os.environ['curl_body'] +exit_code = os.environ['curl_exit'] +path = os.path.join(os.environ['tmp'], 'curl') +with open(path, 'w') as f: + f.write('#!/usr/bin/env bash\n') + f.write(f'echo {shlex.quote(body)}\n') + f.write(f'exit {exit_code}\n') +" + chmod +x "$tmp/curl" + + # Mock aws cli: required by the script but should not be reached in + # fail-closed cases. Writes a sentinel if reached. + cat > "$tmp/aws" <<'MOCK' +#!/usr/bin/env bash +echo "reached" > "$CF_SENTINEL" +exit 99 +MOCK + chmod +x "$tmp/aws" + + # Mock CF API: should never be reached in fail-closed cases. We keep it + # simple and let it write a sentinel if called. 
+ cat > "$tmp/cf_api_mock" <<'MOCK' +#!/usr/bin/env bash +echo "reached" > "$CF_SENTINEL" +echo '{"success":false,"errors":[{"code":9999,"message":"CF API should not be called"}]}' +exit 99 +MOCK + chmod +x "$tmp/cf_api_mock" + + local out="$tmp/out" err="$tmp/err" + PATH="$tmp:$PATH" \ + CF_API_TOKEN=tok \ + CF_ZONE_ID=zone \ + CP_ADMIN_API_TOKEN=tok-prod \ + CP_STAGING_ADMIN_API_TOKEN=tok-staging \ + AWS_ACCESS_KEY_ID=ak \ + AWS_SECRET_ACCESS_KEY=sk \ + CF_SENTINEL="$sentinel" \ + bash "$SCRIPT" --execute > "$out" 2> "$err" + local actual_exit=$? + local case_fail=0 + + if [ "$expect_abort" = "true" ]; then + if [ "$actual_exit" -eq 99 ]; then + echo " ✗ $name: reached CF/AWS mock (exit 99) instead of aborting at fetch" >&2 + case_fail=1 + elif [ "$actual_exit" -eq 0 ]; then + echo " ✗ $name: exited 0 instead of aborting" >&2 + case_fail=1 + fi + if [ -f "$sentinel" ]; then + echo " ✗ $name: CF/AWS sentinel exists — sweep reached classification" >&2 + case_fail=1 + fi + if grep -qE '== Sweep plan ==|would delete:|orphan-' "$out" "$err" 2>/dev/null; then + echo " ✗ $name: output contains sweep plan / orphan classification" >&2 + case_fail=1 + fi + else + # Happy-path control: valid empty orgs array must pass the fetch guard. + # The CF mock will not be called because the script checks CF preflight + # first; to keep this simple we just assert the fetch did not abort. 
+ if [ "$actual_exit" -eq 0 ]; then + echo " ✗ $name: exited 0 — fetch passed but this control needs the script to hit CF preflight" >&2 + case_fail=1 + fi + fi + + if [ "$case_fail" -eq 0 ]; then + echo " ✓ $name" + PASS=$((PASS + 1)) + else + echo " stdout:" >&2 + sed 's/^/ /' "$out" >&2 + echo " stderr:" >&2 + sed 's/^/ /' "$err" >&2 + FAIL=$((FAIL + 1)) + fi + + rm -rf "$tmp" + unset curl_body curl_exit tmp +} + +echo "Test: sweep-cf-orphans live-org fetch fail-closed" +echo + +run_case "prod API returns 500" 22 '{"error":"internal"}' true +run_case "prod API returns malformed JSON" 0 'this is not json' true +run_case "prod API returns JSON without orgs" 0 '{"foo":"bar"}' true +run_case "prod API returns orgs as string" 0 '{"orgs":"not-an-array"}' true +run_case "prod API returns valid empty orgs" 0 '{"orgs":[]}' false + +echo +echo "passed=$PASS failed=$FAIL" +[ "$FAIL" -eq 0 ] -- 2.52.0 From e873459f4c617b31c96d93de1711ca12ad4c3008 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Mon, 22 Jun 2026 02:30:11 +0000 Subject: [PATCH 2/3] test(sweep-cf-orphans): prove fail-closed boundary CR2 RC 13115 + Researcher RC 13114: the previous regression test used a global curl stub, so CF preflight failed before the CP live-org fetch was ever exercised. Rewrite the test with a URL-aware curl mock that lets CF preflight succeed and only returns the bad response for the CP admin orgs endpoints. Fail-closed cases now prove the abort happens before AWS/CF classification; the happy-path case proves the sweep proceeds to CF DNS listing after a valid {'orgs':[]} response. 
Co-Authored-By: Claude --- .../ops/test_sweep_cf_orphans_fail_closed.sh | 95 ++++++++++++------- 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/tests/ops/test_sweep_cf_orphans_fail_closed.sh b/tests/ops/test_sweep_cf_orphans_fail_closed.sh index dcb463df..402c4378 100644 --- a/tests/ops/test_sweep_cf_orphans_fail_closed.sh +++ b/tests/ops/test_sweep_cf_orphans_fail_closed.sh @@ -11,44 +11,65 @@ PASS=0 FAIL=0 run_case() { - local name="$1" curl_exit="$2" curl_body="$3" + local name="$1" cp_exit="$2" cp_body="$3" local expect_abort="${4:-true}" # true = must stop before CF/orphan classification local tmp tmp=$(mktemp -d -t cf-orphans-fail-closed-XXXXXX) local sentinel="$tmp/cf_reached" - export curl_body curl_exit tmp - python3 -c " -import os, shlex -body = os.environ['curl_body'] -exit_code = os.environ['curl_exit'] -path = os.path.join(os.environ['tmp'], 'curl') -with open(path, 'w') as f: - f.write('#!/usr/bin/env bash\n') - f.write(f'echo {shlex.quote(body)}\n') - f.write(f'exit {exit_code}\n') -" + # Generate a smart curl mock that lets CF preflight succeed and only + # returns the bad response for the CP admin orgs endpoints. This proves + # the abort happens at the live-org fetch boundary, not at CF preflight. 
+ cat > "$tmp/curl" <<'MOCK' +#!/usr/bin/env bash +url="" +while [ "$#" -gt 0 ]; do + case "$1" in + https://*) url="$1" ;; + esac + shift +done +case "$url" in + */user/tokens/verify) + echo '{"success":true,"result":{"status":"active"}}' + exit 0 + ;; + */zones/*/dns_records*) + echo '{"success":true,"result":[{"id":"rec1","name":"api.moleculesai.app","type":"A","created_on":"2026-06-20T00:00:00Z"}]}' + echo 'reached' > "$CF_SENTINEL" + exit 0 + ;; + */zones/*) + echo '{"success":true,"result":{"id":"zone"}}' + exit 0 + ;; + */cp/admin/orgs*) + __CP_BODY__ + exit __CP_EXIT__ + ;; + *) + echo '{"success":true,"result":[]}' + echo '{"success":true,"result":[]}' > "$CF_SENTINEL" + exit 0 + ;; +esac +MOCK + # Substitute the test-case body and exit code. Use printf/sed to avoid + # shell quoting issues with JSON in the heredoc. + printf '%s\n' "$cp_body" > "$tmp/cp_body.txt" + sed -i "s|__CP_BODY__|cat \"$tmp/cp_body.txt\"|g; s|__CP_EXIT__|$cp_exit|g" "$tmp/curl" chmod +x "$tmp/curl" - # Mock aws cli: required by the script but should not be reached in - # fail-closed cases. Writes a sentinel if reached. + # Mock aws cli: required by the script. Writes a sentinel if reached. + # Returns valid empty EC2 JSON so the script proceeds to CF DNS list. cat > "$tmp/aws" <<'MOCK' #!/usr/bin/env bash echo "reached" > "$CF_SENTINEL" -exit 99 +echo '{"Reservations":[]}' +exit 0 MOCK chmod +x "$tmp/aws" - # Mock CF API: should never be reached in fail-closed cases. We keep it - # simple and let it write a sentinel if called. 
- cat > "$tmp/cf_api_mock" <<'MOCK' -#!/usr/bin/env bash -echo "reached" > "$CF_SENTINEL" -echo '{"success":false,"errors":[{"code":9999,"message":"CF API should not be called"}]}' -exit 99 -MOCK - chmod +x "$tmp/cf_api_mock" - local out="$tmp/out" err="$tmp/err" PATH="$tmp:$PATH" \ CF_API_TOKEN=tok \ @@ -63,10 +84,9 @@ MOCK local case_fail=0 if [ "$expect_abort" = "true" ]; then - if [ "$actual_exit" -eq 99 ]; then - echo " ✗ $name: reached CF/AWS mock (exit 99) instead of aborting at fetch" >&2 - case_fail=1 - elif [ "$actual_exit" -eq 0 ]; then + # Fail-closed cases: script must abort at the CP live-org fetch, + # before AWS EC2 gather or CF DNS list/classify. + if [ "$actual_exit" -eq 0 ]; then echo " ✗ $name: exited 0 instead of aborting" >&2 case_fail=1 fi @@ -79,11 +99,15 @@ MOCK case_fail=1 fi else - # Happy-path control: valid empty orgs array must pass the fetch guard. - # The CF mock will not be called because the script checks CF preflight - # first; to keep this simple we just assert the fetch did not abort. - if [ "$actual_exit" -eq 0 ]; then - echo " ✗ $name: exited 0 — fetch passed but this control needs the script to hit CF preflight" >&2 + # Happy-path control: valid empty orgs array must pass the fetch guard + # and reach Cloudflare DNS listing. The CF catch-all mock writes the + # sentinel and returns an empty result list, so the script exits 0. + if [ ! 
-f "$sentinel" ]; then + echo " ✗ $name: CF sentinel missing — sweep did not reach DNS list" >&2 + case_fail=1 + fi + if [ "$actual_exit" -ne 0 ]; then + echo " ✗ $name: expected exit 0 after empty DNS list, got $actual_exit" >&2 case_fail=1 fi fi @@ -100,7 +124,6 @@ MOCK fi rm -rf "$tmp" - unset curl_body curl_exit tmp } echo "Test: sweep-cf-orphans live-org fetch fail-closed" @@ -110,7 +133,7 @@ run_case "prod API returns 500" 22 '{"error":"internal" run_case "prod API returns malformed JSON" 0 'this is not json' true run_case "prod API returns JSON without orgs" 0 '{"foo":"bar"}' true run_case "prod API returns orgs as string" 0 '{"orgs":"not-an-array"}' true -run_case "prod API returns valid empty orgs" 0 '{"orgs":[]}' false +run_case "prod API returns valid empty orgs (proceeds)" 0 '{"orgs":[]}' false echo echo "passed=$PASS failed=$FAIL" -- 2.52.0 From 0bf4ec72efa9f2b3a0ea7e445b2400403ae25bcd Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Mon, 22 Jun 2026 02:38:23 +0000 Subject: [PATCH 3/3] test(sweep-cf-orphans): strengthen fail-closed boundary test per review feedback Address reviewer feedback on #3139 by making the boundary between CP live-org fetch and AWS/CF classification explicit: - Separate AWS_SENTINEL and CF_SENTINEL so fail-closed cases prove neither AWS EC2 gather nor CF DNS list/classify is reached. - Happy-path control now asserts both AWS and CF boundaries are crossed when valid empty orgs arrays are returned. - Keep URL-aware curl mock so CF preflight succeeds independently of the bad CP orgs responses under test. Local run: 5/5 pass. 
--- .../ops/test_sweep_cf_orphans_fail_closed.sh | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/tests/ops/test_sweep_cf_orphans_fail_closed.sh b/tests/ops/test_sweep_cf_orphans_fail_closed.sh index 402c4378..f73ec656 100644 --- a/tests/ops/test_sweep_cf_orphans_fail_closed.sh +++ b/tests/ops/test_sweep_cf_orphans_fail_closed.sh @@ -12,14 +12,14 @@ FAIL=0 run_case() { local name="$1" cp_exit="$2" cp_body="$3" - local expect_abort="${4:-true}" # true = must stop before CF/orphan classification + local expect_abort="${4:-true}" # true = must stop before AWS/CF boundary local tmp tmp=$(mktemp -d -t cf-orphans-fail-closed-XXXXXX) - local sentinel="$tmp/cf_reached" - # Generate a smart curl mock that lets CF preflight succeed and only - # returns the bad response for the CP admin orgs endpoints. This proves - # the abort happens at the live-org fetch boundary, not at CF preflight. + # Generate a URL-aware curl mock. CF token/zone preflight and the CF DNS + # list must return valid JSON so the test can prove a bad CP orgs response + # aborts at the live-org fetch boundary, not during preflight or after + # reaching AWS/CF classification. cat > "$tmp/curl" <<'MOCK' #!/usr/bin/env bash url="" @@ -49,7 +49,7 @@ case "$url" in ;; *) echo '{"success":true,"result":[]}' - echo '{"success":true,"result":[]}' > "$CF_SENTINEL" + echo 'reached' > "$CF_SENTINEL" exit 0 ;; esac @@ -60,11 +60,12 @@ MOCK sed -i "s|__CP_BODY__|cat \"$tmp/cp_body.txt\"|g; s|__CP_EXIT__|$cp_exit|g" "$tmp/curl" chmod +x "$tmp/curl" - # Mock aws cli: required by the script. Writes a sentinel if reached. - # Returns valid empty EC2 JSON so the script proceeds to CF DNS list. + # Mock aws cli: required by the script. Returns valid empty EC2 JSON in the + # happy path; writes a sentinel if reached so fail-closed cases prove AWS + # gather was not entered. 
cat > "$tmp/aws" <<'MOCK' #!/usr/bin/env bash -echo "reached" > "$CF_SENTINEL" +echo "reached" > "$AWS_SENTINEL" echo '{"Reservations":[]}' exit 0 MOCK @@ -78,20 +79,25 @@ MOCK CP_STAGING_ADMIN_API_TOKEN=tok-staging \ AWS_ACCESS_KEY_ID=ak \ AWS_SECRET_ACCESS_KEY=sk \ - CF_SENTINEL="$sentinel" \ + CF_SENTINEL="$tmp/cf_reached" \ + AWS_SENTINEL="$tmp/aws_reached" \ bash "$SCRIPT" --execute > "$out" 2> "$err" local actual_exit=$? local case_fail=0 if [ "$expect_abort" = "true" ]; then # Fail-closed cases: script must abort at the CP live-org fetch, - # before AWS EC2 gather or CF DNS list/classify. + # before AWS EC2 gather or CF DNS list/classify/delete. if [ "$actual_exit" -eq 0 ]; then echo " ✗ $name: exited 0 instead of aborting" >&2 case_fail=1 fi - if [ -f "$sentinel" ]; then - echo " ✗ $name: CF/AWS sentinel exists — sweep reached classification" >&2 + if [ -f "$tmp/cf_reached" ]; then + echo " ✗ $name: CF sentinel exists — sweep reached DNS list/classify" >&2 + case_fail=1 + fi + if [ -f "$tmp/aws_reached" ]; then + echo " ✗ $name: AWS sentinel exists — sweep reached EC2 gather" >&2 case_fail=1 fi if grep -qE '== Sweep plan ==|would delete:|orphan-' "$out" "$err" 2>/dev/null; then @@ -99,13 +105,16 @@ MOCK case_fail=1 fi else - # Happy-path control: valid empty orgs array must pass the fetch guard - # and reach Cloudflare DNS listing. The CF catch-all mock writes the - # sentinel and returns an empty result list, so the script exits 0. - if [ ! -f "$sentinel" ]; then + # Happy-path control: valid empty orgs arrays must pass the fetch guard + # and reach both AWS EC2 gather and Cloudflare DNS listing. + if [ ! -f "$tmp/cf_reached" ]; then echo " ✗ $name: CF sentinel missing — sweep did not reach DNS list" >&2 case_fail=1 fi + if [ ! 
-f "$tmp/aws_reached" ]; then + echo " ✗ $name: AWS sentinel missing — sweep did not reach EC2 gather" >&2 + case_fail=1 + fi if [ "$actual_exit" -ne 0 ]; then echo " ✗ $name: expected exit 0 after empty DNS list, got $actual_exit" >&2 case_fail=1 -- 2.52.0