diff --git a/scripts/ops/sweep-aws-secrets.sh b/scripts/ops/sweep-aws-secrets.sh index 3acd0bbff..3c208fb7f 100755 --- a/scripts/ops/sweep-aws-secrets.sh +++ b/scripts/ops/sweep-aws-secrets.sh @@ -19,24 +19,36 @@ # add KindSecretsManagerSecret + recorder hook + reconciler enumerator. # Tracked separately as a controlplane issue. # -# This is a parallel-shape janitor to sweep-cf-tunnels.sh: +# Sweeps TWO managed namespaces (both reduce to "owning org is gone"): +# - molecule/tenant//bootstrap — org_id is in the NAME. +# - molecule/workspace//config — owning org is on the OrgID TAG +# (cp#329 per-workspace config delivery). THIS prefix was the entire +# ~$253/mo SM bill in June 2026 (2.4k orphan secrets from purged +# ephemeral E2E orgs) and was NOT swept before — the old filter only +# matched molecule/tenant/, so the janitor reported SUCCESS while +# deleting nothing relevant. Per-workspace liveness INSIDE a still-live +# org is owned by the CP auto-reap secrets reaper; this sweeper only +# deletes a workspace secret whose whole org is gone (no race risk). +# +# Steps: # 1. Query CP admin API to enumerate live org IDs (prod + staging) -# 2. Enumerate AWS Secrets Manager secrets matching the tenant prefix -# 3. For each secret matching `molecule/tenant//bootstrap`, -# check if appears in the live set -# 4. Defense-in-depth: skip secrets created in the last 24h -# (window for a provision-in-progress that hasn't yet finished -# its first heartbeat to CP) -# 5. Only delete secrets with NO live org counterpart AND outside -# the 24h grace window +# 2. Enumerate SM secrets matching either managed prefix +# 3. tenant/* → org_id from name; workspace/* → org_id from OrgID tag +# 4. Defense-in-depth: skip secrets created in the last GRACE_HOURS +# 5. Only delete secrets whose owning org is NOT in the live set AND are +# outside the grace window # # Dry-run by default; must pass --execute to actually delete. # -# Note on deletion semantics: --force-delete-without-recovery skips -# the 7-30 day recovery window. We accept this because (a) the grace -# window above already filters in-flight provisions, and (b) the -# bootstrap secret is regenerated on every reprovision — losing one -# is recoverable by re-running the provision flow. +# Deletion semantics: RECOVERABLE 30-day delete (NOT force-delete). A +# mistaken sweep is reversible via `aws secretsmanager restore-secret` +# for 30 days. At thousands-of-secrets scale an unrecoverable bulk delete +# is an unacceptable blast radius; matches the CP provisioner + reaper. +# +# Bulk backlog: the MAX_DELETE_PCT gate will (correctly) block a genuine +# >50%-orphan backlog. To drain one deliberately, set SWEEP_ALLOW_BULK=1 +# — the real safety is the live-org cross-reference + 30d recovery, not +# the percent gate. # # Env vars required: # AWS_REGION — region the secrets live in (default: us-east-1) @@ -52,7 +64,8 @@ # 0 — dry-run completed or sweep executed successfully # 1 — missing required env, API failure, or unexpected state # 2 — safety check failed (would delete >MAX_DELETE_PCT% of -# tenant-shaped secrets; refusing) +# managed-shaped secrets; refusing — set SWEEP_ALLOW_BULK=1 to +# drain a deliberate backlog) set -euo pipefail @@ -106,16 +119,40 @@ log() { echo "[$(date -u +%H:%M:%S)] $*"; } # awsapi.TenantSecretName in molecule-controlplane. The /cp/admin/orgs # response includes both `id` and `slug`; we extract `id` here. +# Fetch org IDs from a CP admin API endpoint. +# Fail-closed: any non-2xx HTTP response, invalid JSON, or missing/invalid +# 'orgs' array aborts the sweep with a non-zero exit. This is critical under +# SWEEP_ALLOW_BULK=1, where an empty live-org set would classify every old +# managed secret as orphan. +fetch_cp_orgs() { + local url="$1" token="$2" label="$3" + local resp + resp=$(curl -sS -f -m 15 -H "Authorization: Bearer $token" "$url" 2>&1) || { + echo "ERROR: $label CP admin API request failed (non-2xx or network error)" >&2 + echo "$resp" >&2 + return 1 + } + python3 -c " +import json, sys +try: + d = json.loads(sys.stdin.read()) +except json.JSONDecodeError as e: + print('ERROR: $label CP admin API returned invalid JSON:', e, file=sys.stderr) + sys.exit(1) +orgs = d.get('orgs') +if not isinstance(orgs, list): + print('ERROR: $label CP admin API response missing or invalid \"orgs\" array', file=sys.stderr) + sys.exit(1) +print(' '.join(o['id'] for o in orgs)) +" <<< "$resp" +} + log "Fetching CP prod org ids..." -PROD_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ - "https://api.moleculesai.app/cp/admin/orgs?limit=500" \ - | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))") +PROD_IDS=$(fetch_cp_orgs "https://api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_ADMIN_API_TOKEN" "prod") log " prod orgs: $(echo "$PROD_IDS" | wc -w | tr -d ' ')" log "Fetching CP staging org ids..." -STAGING_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \ - "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \ - | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))") +STAGING_IDS=$(fetch_cp_orgs "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_STAGING_ADMIN_API_TOKEN" "staging") log " staging orgs: $(echo "$STAGING_IDS" | wc -w | tr -d ' ')" log "Fetching AWS Secrets Manager secrets (region=$AWS_REGION)..." @@ -143,16 +180,22 @@ NEXT_TOKEN="" PAGE=1 while :; do page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json" + # Sweep BOTH managed prefixes: molecule/tenant/* (per-org bootstrap) AND + # molecule/workspace/* (per-workspace config, cp#329). The latter was the + # entire ~$253/mo SM bill in June 2026 (2.4k orphan secrets) and this + # filter never matched it before — the sweeper reported SUCCESS while + # deleting nothing relevant. A name-filter Values list is OR-matched by + # Secrets Manager, so this captures both namespaces in one paginated walk. if [ -z "$NEXT_TOKEN" ]; then aws secretsmanager list-secrets \ --region "$AWS_REGION" \ - --filters Key=name,Values=molecule/tenant/ \ + --filters Key=name,Values=molecule/tenant/,molecule/workspace/ \ --max-results 100 \ --output json > "$page_file" else aws secretsmanager list-secrets \ --region "$AWS_REGION" \ - --filters Key=name,Values=molecule/tenant/ \ + --filters Key=name,Values=molecule/tenant/,molecule/workspace/ \ --max-results 100 \ --next-token "$NEXT_TOKEN" \ --output json > "$page_file" @@ -160,8 +203,8 @@ while :; do NEXT_TOKEN=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d.get('NextToken') or '')" "$page_file") PAGE=$((PAGE + 1)) if [ -z "$NEXT_TOKEN" ]; then break; fi - if [ "$PAGE" -gt 50 ]; then - log "::warning::stopping pagination at page 50 (5000 secrets) — re-run if more" + if [ "$PAGE" -gt 100 ]; then + log "::warning::stopping pagination at page 100 (10000 secrets) — re-run if more" break fi done @@ -179,14 +222,28 @@ log " total tenant-prefixed secrets: $TOTAL_SECRETS" # --- Compute orphans ------------------------------------------------------- # -# Rules (in order): -# 1. Name doesn't match `molecule/tenant//bootstrap` → keep -# (unknown — never sweep arbitrary secrets that might belong to -# platform infra or other tenants of this AWS account). -# 2. CreatedDate within $GRACE_HOURS → keep (defense-in-depth: don't -# kill a secret while its provision is still mid-flight). -# 3. org_id ∈ {prod_ids ∪ staging_ids} → keep (live tenant). -# 4. Otherwise → delete (orphan). +# Two managed namespaces, cross-referenced against the SAME live ORG set +# (prod_ids ∪ staging_ids fetched from the CP admin API). Both reduce to +# "the owning org no longer exists ⇒ orphan", which is the safe, org-level +# signal the sweeper can establish without per-workspace liveness: +# +# molecule/tenant//bootstrap — org_id is in the NAME. +# molecule/workspace//config — ws_id is in the name (NOT an org +# id); the owning org is on the secret's OrgID TAG (set by the CP +# provisioner's seedWorkspaceConfigSecret). A workspace secret whose +# OrgID tag is not a live org is a guaranteed orphan: the whole tenant +# is gone, so the workspace can't exist. (Per-workspace liveness inside +# a STILL-LIVE org is owned by the CP auto-reap secrets reaper, which +# calls the tenant /workspaces endpoint — this sweeper deliberately +# does NOT delete a workspace secret whose org is still live, to avoid +# racing a live tenant's in-flight workspace.) +# +# Rules (in order, per secret): +# 1. Name matches neither managed shape → keep (never sweep arbitrary +# secrets that might belong to platform infra). +# 2. CreatedDate within $GRACE_HOURS → keep (provision-in-flight margin). +# 3. owning org ∈ {prod_ids ∪ staging_ids} → keep (live tenant). +# 4. Otherwise → delete (orphan) via 30-day RECOVERABLE delete. export PROD_IDS STAGING_IDS GRACE_HOURS DECISIONS=$(echo "$SECRET_JSON" | python3 -c ' @@ -201,6 +258,8 @@ now = datetime.now(timezone.utc) # molecule/tenant//bootstrap — org_id is a UUID. _TENANT_RE = re.compile(r"^molecule/tenant/([0-9a-fA-F-]{36})/bootstrap$") +# molecule/workspace//config — ws_id is a UUID; owning org is on the tag. +_WS_RE = re.compile(r"^molecule/workspace/([0-9a-fA-F-]{36})/config$") def parse_iso(s): if not s: @@ -212,24 +271,43 @@ def parse_iso(s): except ValueError: return None +def org_tag(s): + for t in s.get("Tags") or []: + if t.get("Key") == "OrgID": + return t.get("Value") or "" + return "" + def decide(s, all_ids, grace, now): name = s.get("Name", "") arn = s.get("ARN", "") - m = _TENANT_RE.match(name) - if not m: - return ("keep", "not-a-tenant-secret", arn, name) - - org_id = m.group(1) + mt = _TENANT_RE.match(name) + mw = _WS_RE.match(name) + if not mt and not mw: + return ("keep", "not-a-managed-secret", arn, name) + # Grace gate (both shapes): never touch a secret younger than the window. created = parse_iso(s.get("CreatedDate") or s.get("LastChangedDate")) if created is not None and (now - created) < grace: return ("keep", "in-grace-window", arn, name) - if org_id in all_ids: - return ("keep", "live-tenant", arn, name) + if mt: + org_id = mt.group(1) + if org_id in all_ids: + return ("keep", "live-tenant", arn, name) + return ("delete", "orphan-tenant", arn, name) - return ("delete", "orphan-tenant", arn, name) + # workspace-config: owning org is on the OrgID tag. + org_id = org_tag(s) + if not org_id: + # No OrgID tag (legacy / hand-created) — cannot establish ownership; + # keep and let the CP reaper (which parses the live set) handle it. + return ("keep", "workspace-no-org-tag", arn, name) + if org_id in all_ids: + # Org still live — defer to the CP auto-reap secrets reaper for + # per-workspace liveness; do not race a live tenant here. + return ("keep", "workspace-live-org", arn, name) + return ("delete", "orphan-workspace", arn, name) d = json.loads(sys.stdin.read()) for s in d.get("SecretList", []): @@ -241,16 +319,16 @@ for s in d.get("SecretList", []): DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))") KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT)) -TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c " +MANAGED_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c " import json, sys -n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret') +n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-managed-secret') print(n) ") log "" log "== Sweep plan ==" log " total secrets: $TOTAL_SECRETS" -log " tenant-shaped secrets: $TENANT_SECRETS" +log " managed-shaped secrets: $MANAGED_SECRETS" log " would delete: $DELETE_COUNT" log " would keep: $KEEP_COUNT" log "" @@ -272,17 +350,35 @@ for reason, n in keep_c.most_common(): print(f' keep/{reason}: {n}') " -# Safety gate operates against the tenant-shaped subset — same +# Safety gate operates against the managed-shaped subset — same # rationale as sweep-cf-tunnels: a miscount of platform-infra # secrets shouldn't relax the gate. -if [ "$TENANT_SECRETS" -gt 0 ]; then - PCT=$(( DELETE_COUNT * 100 / TENANT_SECRETS )) +# +# IMPORTANT (the historical no-op trap): the REAL safety here is the +# per-secret live-ORG cross-reference above — a secret is only marked +# delete when its owning org provably no longer exists in the CP DB. The +# percent gate is a blunt second line that, on a large genuine backlog +# (e.g. the June-2026 2.4k-orphan workspace-config sprawl, ~99% orphan), +# would itself BLOCK the very cleanup it exists to allow. So: +# - Normal steady state: <50% delete → gate passes, runs every hour. +# - Genuine bulk backlog: set SWEEP_ALLOW_BULK=1 to bypass the percent +# gate DELIBERATELY, trusting the live-org cross-reference. This is the +# sanctioned way to drain a backlog — NOT a blind MAX_DELETE_PCT bump. +if [ "$MANAGED_SECRETS" -gt 0 ]; then + PCT=$(( DELETE_COUNT * 100 / MANAGED_SECRETS )) if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then - log "" - log "SAFETY: would delete $PCT% of tenant-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing." - log " If this is expected (e.g. major cleanup after incident), rerun with" - log " MAX_DELETE_PCT=$((PCT+5)) $0 $*" - exit 2 + if [ "${SWEEP_ALLOW_BULK:-0}" = "1" ]; then + log "" + log "SAFETY: would delete $PCT% of managed-shaped secrets (>$MAX_DELETE_PCT%) — BULK override active (SWEEP_ALLOW_BULK=1)." + log " Proceeding: every delete is live-org-cross-referenced AND a 30-day RECOVERABLE delete (restorable)." + else + log "" + log "SAFETY: would delete $PCT% of managed-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing." + log " This is the expected gate on a genuine backlog. The deletes are" + log " live-org-cross-referenced and recoverable (30d). To drain a backlog" + log " deliberately, rerun with SWEEP_ALLOW_BULK=1 $0 $*" + exit 2 + fi fi fi @@ -312,11 +408,15 @@ fi # 30 min cap, but parallel-by-default keeps us symmetric with the # other sweepers and gives headroom for a one-off backlog. # -# --force-delete-without-recovery skips the 7-30 day recovery window. -# Acceptable here because (a) the GRACE_HOURS filter prevents touching -# in-flight provisions, and (b) the secret is regenerated on every -# fresh provision — losing one only matters for a tenant we're -# explicitly trying to forget. +# Deletion is RECOVERABLE (30-day recovery window), NOT force-delete. +# Changed from --force-delete-without-recovery: a mistaken sweep must be +# reversible via `aws secretsmanager restore-secret` for 30 days. The +# GRACE_HOURS filter + live-org cross-reference make a mistake unlikely, +# but at this scale (thousands of secrets) an unrecoverable bulk delete is +# an unacceptable blast radius. The secret is also regenerated on every +# fresh provision, so the recovery window is belt-and-suspenders, not the +# only safety. Matches the CP provisioner + auto-reap reaper, which both +# use DeleteSecret + RecoveryWindowInDays=30. CONCURRENCY="${SWEEP_CONCURRENCY:-8}" DELETE_PLAN=$(mktemp -t aws-secrets-plan-XXXXXX) @@ -353,7 +453,7 @@ xargs -P "$CONCURRENCY" -L 1 -I {} bash -c ' if aws secretsmanager delete-secret \ --region "$AWS_REGION" \ --secret-id "$arn" \ - --force-delete-without-recovery \ + --recovery-window-in-days 30 \ --output json >/dev/null 2>&1; then echo OK else diff --git a/scripts/ops/test_sweep_aws_decide.py b/scripts/ops/test_sweep_aws_decide.py new file mode 100644 index 000000000..3551f1a15 --- /dev/null +++ b/scripts/ops/test_sweep_aws_decide.py @@ -0,0 +1,124 @@ +"""Tests for the sweep-aws-secrets.sh decision logic (#890). + +Run locally: ``python3 -m unittest scripts/ops/test_sweep_aws_decide.py -v`` + +Why this exists: the inline Python heredoc in sweep-aws-secrets.sh decides +which AWS Secrets Manager secrets to delete. A misclassification could nuke +a LIVE tenant's bootstrap secret or a live workspace's config secret. These +tests pin the rule order for BOTH managed namespaces: + + - molecule/tenant//bootstrap (org_id in the NAME) + - molecule/workspace//config (owning org on the OrgID TAG) + +To avoid drift, the test EXTRACTS the `decide`/`org_tag`/`parse_iso` helpers +straight out of the shell script's heredoc at runtime and execs them — so a +change to the shell logic that isn't reflected here (or vice-versa) fails. +""" +from __future__ import annotations + +import os +import re +import unittest +from datetime import datetime, timedelta, timezone + +SCRIPT = os.path.join(os.path.dirname(__file__), "sweep-aws-secrets.sh") + + +def _load_decide(): + """Extract the decision helpers from the shell heredoc and exec them.""" + src = open(SCRIPT, encoding="utf-8").read() + # The heredoc body lives between `DECISIONS=$(echo "$SECRET_JSON" | python3 -c '` + # and the closing `')`. Grab the python source between the single quotes. + m = re.search(r"DECISIONS=\$\(echo \"\$SECRET_JSON\" \| python3 -c '(.*?)'\)", src, re.S) + if not m: + raise AssertionError("could not locate the decision heredoc in sweep-aws-secrets.sh") + body = m.group(1) + # The heredoc reads PROD_IDS/STAGING_IDS/GRACE_HOURS from os.environ and + # reads stdin at the bottom. We only want the pure functions, so exec the + # whole body in a namespace where env + stdin are pre-seeded, but stop it + # from consuming stdin by trimming everything from the final stdin loop. + cut = body.find("d = json.loads(sys.stdin.read())") + if cut != -1: + body = body[:cut] + ns: dict = {} + os.environ.setdefault("PROD_IDS", "live-org-1") + os.environ.setdefault("STAGING_IDS", "live-org-2") + os.environ.setdefault("GRACE_HOURS", "24") + exec(compile(body, "", "exec"), ns) + return ns + + +NS = _load_decide() +ALL_IDS = {"live-org-1", "live-org-2"} +GRACE = timedelta(hours=24) +NOW = datetime.now(timezone.utc) +OLD = (NOW - timedelta(days=30)).isoformat() +FRESH = NOW.isoformat() +U = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + + +def decide(secret: dict): + action, reason, _arn, _name = NS["decide"](secret, ALL_IDS, GRACE, NOW) + return action, reason + + +class TestNonManaged(unittest.TestCase): + def test_arbitrary_secret_kept(self): + self.assertEqual(decide({"Name": "molecule/random/x", "CreatedDate": OLD}), + ("keep", "not-a-managed-secret")) + + def test_bad_uuid_tenant_kept(self): + self.assertEqual(decide({"Name": "molecule/tenant/not-a-uuid/bootstrap", "CreatedDate": OLD}), + ("keep", "not-a-managed-secret")) + + +class TestTenant(unittest.TestCase): + def test_orphan_tenant_deleted(self): + self.assertEqual(decide({"Name": f"molecule/tenant/{U}/bootstrap", "CreatedDate": OLD}), + ("delete", "orphan-tenant")) + + def test_live_tenant_kept(self): + self.assertEqual(decide({"Name": "molecule/tenant/live-org-1xxxxxxxxxxxxxxxxxxxxxxxxxxx/bootstrap", + "CreatedDate": OLD}), + ("keep", "not-a-managed-secret")) # wrong length → not managed + + def test_live_tenant_real_uuid_kept(self): + live = "11111111-2222-3333-4444-555555555555" + os.environ["PROD_IDS"] = "live-org-1" + # use ALL_IDS that includes the uuid + action, reason, _, _ = NS["decide"]( + {"Name": f"molecule/tenant/{live}/bootstrap", "CreatedDate": OLD}, + ALL_IDS | {live}, GRACE, NOW) + self.assertEqual((action, reason), ("keep", "live-tenant")) + + +class TestWorkspace(unittest.TestCase): + def test_orphan_workspace_deleted(self): + self.assertEqual(decide({"Name": f"molecule/workspace/{U}/config", "CreatedDate": OLD, + "Tags": [{"Key": "OrgID", "Value": "dead-org"}]}), + ("delete", "orphan-workspace")) + + def test_live_org_workspace_kept(self): + self.assertEqual(decide({"Name": f"molecule/workspace/{U}/config", "CreatedDate": OLD, + "Tags": [{"Key": "OrgID", "Value": "live-org-2"}]}), + ("keep", "workspace-live-org")) + + def test_no_org_tag_kept(self): + self.assertEqual(decide({"Name": f"molecule/workspace/{U}/config", "CreatedDate": OLD, + "Tags": []}), + ("keep", "workspace-no-org-tag")) + + +class TestGrace(unittest.TestCase): + def test_fresh_orphan_workspace_kept(self): + self.assertEqual(decide({"Name": f"molecule/workspace/{U}/config", "CreatedDate": FRESH, + "Tags": [{"Key": "OrgID", "Value": "dead-org"}]}), + ("keep", "in-grace-window")) + + def test_fresh_orphan_tenant_kept(self): + self.assertEqual(decide({"Name": f"molecule/tenant/{U}/bootstrap", "CreatedDate": FRESH}), + ("keep", "in-grace-window")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/ops/test_sweep_aws_secrets_fail_closed.sh b/tests/ops/test_sweep_aws_secrets_fail_closed.sh new file mode 100755 index 000000000..7fa245cad --- /dev/null +++ b/tests/ops/test_sweep_aws_secrets_fail_closed.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +# Regression test for scripts/ops/sweep-aws-secrets.sh — verifies the +# live-org fetch is fail-closed. A non-2xx response, invalid JSON, or a +# response missing the 'orgs' array must abort the sweep BEFORE any secrets +# are classified as orphans. This is especially critical under +# SWEEP_ALLOW_BULK=1, where an empty live-org set would otherwise delete +# every old managed secret. +set -uo pipefail + +SCRIPT="${SCRIPT:-scripts/ops/sweep-aws-secrets.sh}" + +PASS=0 +FAIL=0 + +run_case() { + local name="$1" curl_exit="$2" curl_body="$3" bulk="${4:-0}" + local expect_abort="${5:-true}" # true = must stop before AWS/orphan classification + local tmp + tmp=$(mktemp -d -t sweep-fail-closed-XXXXXX) + local sentinel="$tmp/aws_reached" + + # Generate a mock curl script. We use Python to write the body so that + # JSON quotes/brackets are not mangled by shell quoting in a heredoc. + export curl_body curl_exit tmp + python3 -c " +import os, shlex +body = os.environ['curl_body'] +exit_code = os.environ['curl_exit'] +path = os.path.join(os.environ['tmp'], 'curl') +with open(path, 'w') as f: + f.write('#!/usr/bin/env bash\n') + f.write(f'echo {shlex.quote(body)}\n') + f.write(f'exit {exit_code}\n') +" + chmod +x "$tmp/curl" + + # Mock aws cli: writes a sentinel file and exits with a distinctive code + # so we can prove whether the sweep reached AWS/classification. + cat > "$tmp/aws" <<'MOCK' +#!/usr/bin/env bash +echo "reached" > "$AWS_SENTINEL" +exit 99 +MOCK + chmod +x "$tmp/aws" + + local out="$tmp/out" err="$tmp/err" + PATH="$tmp:$PATH" \ + CP_ADMIN_API_TOKEN=tok-prod \ + CP_STAGING_ADMIN_API_TOKEN=tok-staging \ + AWS_ACCESS_KEY_ID=ak \ + AWS_SECRET_ACCESS_KEY=sk \ + AWS_SENTINEL="$sentinel" \ + SWEEP_ALLOW_BULK="$bulk" \ + bash "$SCRIPT" --execute > "$out" 2> "$err" + local actual_exit=$? + local case_fail=0 + + if [ "$expect_abort" = "true" ]; then + # Fail-closed cases: script must abort before AWS and before orphan + # classification. Exit code should be the fetch/validation failure (1), + # NOT the aws mock's distinctive 99. + if [ "$actual_exit" -eq 99 ]; then + echo " ✗ $name: reached aws mock (exit 99) instead of aborting at fetch" >&2 + case_fail=1 + elif [ "$actual_exit" -eq 0 ]; then + echo " ✗ $name: exited 0 instead of aborting" >&2 + case_fail=1 + fi + if [ -f "$sentinel" ]; then + echo " ✗ $name: aws sentinel exists — sweep reached AWS/classification" >&2 + case_fail=1 + fi + if grep -qE '== Sweep plan ==|would delete:|orphan-(tenant|workspace)' "$out" "$err" 2>/dev/null; then + echo " ✗ $name: output contains sweep plan / orphan classification" >&2 + case_fail=1 + fi + else + # Happy-path control: valid live-org fetch must allow the sweep to proceed past + # the live-org fetch and reach AWS/classification. We use an empty orgs list + # so no real delete work happens; the aws mock proves the boundary was crossed. + if [ ! -f "$sentinel" ]; then + echo " ✗ $name: aws sentinel missing — sweep did not reach AWS/classification" >&2 + case_fail=1 + fi + if [ "$actual_exit" -ne 99 ]; then + echo " ✗ $name: expected aws mock exit 99, got $actual_exit" >&2 + case_fail=1 + fi + fi + + if [ "$case_fail" -eq 0 ]; then + echo " ✓ $name" + PASS=$((PASS + 1)) + else + echo " stdout:" >&2 + sed 's/^/ /' "$out" >&2 + echo " stderr:" >&2 + sed 's/^/ /' "$err" >&2 + FAIL=$((FAIL + 1)) + fi + + rm -rf "$tmp" + unset curl_body curl_exit +} + +echo "Test: sweep-aws-secrets live-org fetch fail-closed" +echo + +# Non-2xx from CP admin API (curl -f exits 22). +run_case "prod API returns 500" 22 '{"error":"internal"}' 0 true +run_case "prod API returns 500 with SWEEP_ALLOW_BULK=1" 22 '{"error":"internal"}' 1 true + +# Valid HTTP but invalid JSON body. +run_case "prod API returns malformed JSON" 0 'this is not json' 0 true +run_case "prod API returns malformed JSON with SWEEP_ALLOW_BULK=1" 0 'this is not json' 1 true + +# Valid JSON but missing 'orgs' key. +run_case "prod API returns JSON without orgs" 0 '{"foo":"bar"}' 0 true +run_case "prod API returns JSON without orgs with SWEEP_ALLOW_BULK=1" 0 '{"foo":"bar"}' 1 true + +# Valid JSON but 'orgs' is not an array. +run_case "prod API returns orgs as string" 0 '{"orgs":"not-an-array"}' 0 true + +# Happy-path control: valid orgs array must allow the sweep to proceed past +# the live-org fetch and reach AWS/classification. We use an empty orgs list +# so no real delete work happens; the aws mock proves the boundary was crossed. +run_case "prod API returns valid empty orgs (reaches AWS)" 0 '{"orgs":[]}' 0 false + +echo +echo "passed=$PASS failed=$FAIL" +[ "$FAIL" -eq 0 ]