fix(sweep-aws-secrets): sweep molecule/workspace/*config too + recoverable delete + bulk gate override (#890) #3134

Merged
core-devops merged 3 commits from fix/sweep-aws-workspace-config-secrets into main 2026-06-22 01:31:53 +00:00
3 changed files with 413 additions and 58 deletions
+158 -58
View File
@@ -19,24 +19,36 @@
# add KindSecretsManagerSecret + recorder hook + reconciler enumerator.
# Tracked separately as a controlplane issue.
#
# This is a parallel-shape janitor to sweep-cf-tunnels.sh:
# Sweeps TWO managed namespaces (both reduce to "owning org is gone"):
# - molecule/tenant/<org_id>/bootstrap — org_id is in the NAME.
# - molecule/workspace/<ws_id>/config — owning org is on the OrgID TAG
# (cp#329 per-workspace config delivery). THIS prefix was the entire
# ~$253/mo SM bill in June 2026 (2.4k orphan secrets from purged
# ephemeral E2E orgs) and was NOT swept before — the old filter only
# matched molecule/tenant/, so the janitor reported SUCCESS while
# deleting nothing relevant. Per-workspace liveness INSIDE a still-live
# org is owned by the CP auto-reap secrets reaper; this sweeper only
# deletes a workspace secret whose whole org is gone (no race risk).
#
# Steps:
# 1. Query CP admin API to enumerate live org IDs (prod + staging)
# 2. Enumerate AWS Secrets Manager secrets matching the tenant prefix
# 3. For each secret matching `molecule/tenant/<org_id>/bootstrap`,
# check if <org_id> appears in the live set
# 4. Defense-in-depth: skip secrets created in the last 24h
# (window for a provision-in-progress that hasn't yet finished
# its first heartbeat to CP)
# 5. Only delete secrets with NO live org counterpart AND outside
# the 24h grace window
# 2. Enumerate SM secrets matching either managed prefix
# 3. tenant/* → org_id from name; workspace/* → org_id from OrgID tag
# 4. Defense-in-depth: skip secrets created in the last GRACE_HOURS
# 5. Only delete secrets whose owning org is NOT in the live set AND are
# outside the grace window
#
# Dry-run by default; must pass --execute to actually delete.
#
# Note on deletion semantics: --force-delete-without-recovery skips
# the 7-30 day recovery window. We accept this because (a) the grace
# window above already filters in-flight provisions, and (b) the
# bootstrap secret is regenerated on every reprovision — losing one
# is recoverable by re-running the provision flow.
# Deletion semantics: RECOVERABLE 30-day delete (NOT force-delete). A
# mistaken sweep is reversible via `aws secretsmanager restore-secret`
# for 30 days. At thousands-of-secrets scale an unrecoverable bulk delete
# is an unacceptable blast radius; matches the CP provisioner + reaper.
#
# Bulk backlog: the MAX_DELETE_PCT gate will (correctly) block a genuine
# >50%-orphan backlog. To drain one deliberately, set SWEEP_ALLOW_BULK=1
# — the real safety is the live-org cross-reference + 30d recovery, not
# the percent gate.
#
# Env vars required:
# AWS_REGION — region the secrets live in (default: us-east-1)
@@ -52,7 +64,8 @@
# 0 — dry-run completed or sweep executed successfully
# 1 — missing required env, API failure, or unexpected state
# 2 — safety check failed (would delete >MAX_DELETE_PCT% of
# tenant-shaped secrets; refusing)
# managed-shaped secrets; refusing — set SWEEP_ALLOW_BULK=1 to
# drain a deliberate backlog)
set -euo pipefail
@@ -106,16 +119,40 @@ log() { echo "[$(date -u +%H:%M:%S)] $*"; }
# awsapi.TenantSecretName in molecule-controlplane. The /cp/admin/orgs
# response includes both `id` and `slug`; we extract `id` here.
# Fetch live org IDs from a CP admin API endpoint.
#
# Fail-closed: a non-2xx HTTP response, a network error, invalid JSON, a
# missing/invalid 'orgs' array, or an org entry without a usable string 'id'
# all make this function return non-zero so the caller aborts the sweep. This
# is critical under SWEEP_ALLOW_BULK=1, where an empty or truncated live-org
# set would classify every old managed secret as orphan.
#
# Arguments:
#   $1 - full URL of the admin orgs endpoint
#   $2 - bearer token for that endpoint
#   $3 - human-readable label ("prod", "staging") used in error messages
# Outputs:
#   stdout: the org ids, space-separated on one line (empty line for no orgs)
#   stderr: an "ERROR: ..." diagnostic on any failure
# Returns:
#   0 on success, 1 on any failure
#
# NOTE(review): this performs a single request and does not paginate. If the
# endpoint truncates at the caller's limit, orgs beyond it would be missing
# from the live set — confirm the limit comfortably exceeds the org count.
fetch_cp_orgs() {
  local url="$1" token="$2" label="$3"
  local resp

  # Capture stdout only: curl's own diagnostics (-S) go straight to stderr so
  # they can never be mixed into the JSON body that is parsed below.
  resp=$(curl -sS -f -m 15 -H "Authorization: Bearer $token" "$url") || {
    echo "ERROR: $label CP admin API request failed (non-2xx or network error)" >&2
    [ -z "$resp" ] || printf '%s\n' "$resp" >&2
    return 1
  }

  # The label is passed as argv, never interpolated into the Python source,
  # so quotes or other metacharacters in it cannot break or alter the program.
  python3 -c '
import json
import sys

label = sys.argv[1]


def die(*parts):
    print("ERROR: " + label + " CP admin API", *parts, file=sys.stderr)
    sys.exit(1)


try:
    d = json.loads(sys.stdin.read())
except ValueError as e:
    die("returned invalid JSON:", e)

orgs = d.get("orgs") if isinstance(d, dict) else None
if not isinstance(orgs, list):
    die("response missing or invalid \"orgs\" array")

ids = []
for o in orgs:
    oid = o.get("id") if isinstance(o, dict) else None
    # The ids are emitted space-separated, so an empty id or one containing
    # whitespace cannot be represented; refuse rather than emit a bad set.
    if not isinstance(oid, str) or oid.split() != [oid]:
        die("response contains an org without a usable string \"id\"")
    ids.append(oid)

print(" ".join(ids))
' "$label" <<< "$resp"
}
log "Fetching CP prod org ids..."
PROD_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
"https://api.moleculesai.app/cp/admin/orgs?limit=500" \
| python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))")
PROD_IDS=$(fetch_cp_orgs "https://api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_ADMIN_API_TOKEN" "prod")
log " prod orgs: $(echo "$PROD_IDS" | wc -w | tr -d ' ')"
log "Fetching CP staging org ids..."
STAGING_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
"https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
| python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))")
STAGING_IDS=$(fetch_cp_orgs "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_STAGING_ADMIN_API_TOKEN" "staging")
log " staging orgs: $(echo "$STAGING_IDS" | wc -w | tr -d ' ')"
log "Fetching AWS Secrets Manager secrets (region=$AWS_REGION)..."
@@ -143,16 +180,22 @@ NEXT_TOKEN=""
PAGE=1
while :; do
page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json"
# Sweep BOTH managed prefixes: molecule/tenant/* (per-org bootstrap) AND
# molecule/workspace/* (per-workspace config, cp#329). The latter was the
# entire ~$253/mo SM bill in June 2026 (2.4k orphan secrets) and this
# filter never matched it before — the sweeper reported SUCCESS while
# deleting nothing relevant. A name-filter Values list is OR-matched by
# Secrets Manager, so this captures both namespaces in one paginated walk.
if [ -z "$NEXT_TOKEN" ]; then
aws secretsmanager list-secrets \
--region "$AWS_REGION" \
--filters Key=name,Values=molecule/tenant/ \
--filters Key=name,Values=molecule/tenant/,molecule/workspace/ \
--max-results 100 \
--output json > "$page_file"
else
aws secretsmanager list-secrets \
--region "$AWS_REGION" \
--filters Key=name,Values=molecule/tenant/ \
--filters Key=name,Values=molecule/tenant/,molecule/workspace/ \
--max-results 100 \
--next-token "$NEXT_TOKEN" \
--output json > "$page_file"
@@ -160,8 +203,8 @@ while :; do
NEXT_TOKEN=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d.get('NextToken') or '')" "$page_file")
PAGE=$((PAGE + 1))
if [ -z "$NEXT_TOKEN" ]; then break; fi
if [ "$PAGE" -gt 50 ]; then
log "::warning::stopping pagination at page 50 (5000 secrets) — re-run if more"
if [ "$PAGE" -gt 100 ]; then
log "::warning::stopping pagination at page 100 (10000 secrets) — re-run if more"
break
fi
done
@@ -179,14 +222,28 @@ log " total tenant-prefixed secrets: $TOTAL_SECRETS"
# --- Compute orphans -------------------------------------------------------
#
# Rules (in order):
# 1. Name doesn't match `molecule/tenant/<org_id>/bootstrap` → keep
# (unknown — never sweep arbitrary secrets that might belong to
# platform infra or other tenants of this AWS account).
# 2. CreatedDate within $GRACE_HOURS → keep (defense-in-depth: don't
# kill a secret while its provision is still mid-flight).
# 3. org_id ∈ {prod_ids staging_ids} → keep (live tenant).
# 4. Otherwise → delete (orphan).
# Two managed namespaces, cross-referenced against the SAME live ORG set
# (prod_ids staging_ids fetched from the CP admin API). Both reduce to
# "the owning org no longer exists ⇒ orphan", which is the safe, org-level
# signal the sweeper can establish without per-workspace liveness:
#
# molecule/tenant/<org_id>/bootstrap — org_id is in the NAME.
# molecule/workspace/<ws_id>/config — ws_id is in the name (NOT an org
# id); the owning org is on the secret's OrgID TAG (set by the CP
# provisioner's seedWorkspaceConfigSecret). A workspace secret whose
# OrgID tag is not a live org is a guaranteed orphan: the whole tenant
# is gone, so the workspace can't exist. (Per-workspace liveness inside
# a STILL-LIVE org is owned by the CP auto-reap secrets reaper, which
# calls the tenant /workspaces endpoint — this sweeper deliberately
# does NOT delete a workspace secret whose org is still live, to avoid
# racing a live tenant's in-flight workspace.)
#
# Rules (in order, per secret):
# 1. Name matches neither managed shape → keep (never sweep arbitrary
# secrets that might belong to platform infra).
# 2. CreatedDate within $GRACE_HOURS → keep (provision-in-flight margin).
# 3. owning org ∈ {prod_ids staging_ids} → keep (live tenant).
# 4. Otherwise → delete (orphan) via 30-day RECOVERABLE delete.
export PROD_IDS STAGING_IDS GRACE_HOURS
DECISIONS=$(echo "$SECRET_JSON" | python3 -c '
@@ -201,6 +258,8 @@ now = datetime.now(timezone.utc)
# molecule/tenant/<org_id>/bootstrap — org_id is a UUID.
_TENANT_RE = re.compile(r"^molecule/tenant/([0-9a-fA-F-]{36})/bootstrap$")
# molecule/workspace/<ws_id>/config — ws_id is a UUID; owning org is on the tag.
_WS_RE = re.compile(r"^molecule/workspace/([0-9a-fA-F-]{36})/config$")
def parse_iso(s):
if not s:
@@ -212,24 +271,43 @@ def parse_iso(s):
except ValueError:
return None
def org_tag(s):
for t in s.get("Tags") or []:
if t.get("Key") == "OrgID":
return t.get("Value") or ""
return ""
def decide(s, all_ids, grace, now):
name = s.get("Name", "")
arn = s.get("ARN", "")
m = _TENANT_RE.match(name)
if not m:
return ("keep", "not-a-tenant-secret", arn, name)
org_id = m.group(1)
mt = _TENANT_RE.match(name)
mw = _WS_RE.match(name)
if not mt and not mw:
return ("keep", "not-a-managed-secret", arn, name)
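# Grace gate (both shapes): skip any secret whose creation date is known and
# younger than the window.
# NOTE(review): when neither CreatedDate nor LastChangedDate yields a
# parseable timestamp, parse_iso returns None and this gate is SKIPPED
# (fail-open) — confirm that is intended, or treat unknown age as in-grace.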
created = parse_iso(s.get("CreatedDate") or s.get("LastChangedDate"))
if created is not None and (now - created) < grace:
return ("keep", "in-grace-window", arn, name)
if org_id in all_ids:
return ("keep", "live-tenant", arn, name)
if mt:
org_id = mt.group(1)
if org_id in all_ids:
return ("keep", "live-tenant", arn, name)
return ("delete", "orphan-tenant", arn, name)
return ("delete", "orphan-tenant", arn, name)
# workspace-config: owning org is on the OrgID tag.
org_id = org_tag(s)
if not org_id:
# No OrgID tag (legacy / hand-created) — cannot establish ownership;
# keep and let the CP reaper (which parses the live set) handle it.
return ("keep", "workspace-no-org-tag", arn, name)
if org_id in all_ids:
# Org still live — defer to the CP auto-reap secrets reaper for
# per-workspace liveness; do not race a live tenant here.
return ("keep", "workspace-live-org", arn, name)
return ("delete", "orphan-workspace", arn, name)
d = json.loads(sys.stdin.read())
for s in d.get("SecretList", []):
@@ -241,16 +319,16 @@ for s in d.get("SecretList", []):
DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
MANAGED_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
import json, sys
n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-managed-secret')
print(n)
")
log ""
log "== Sweep plan =="
log " total secrets: $TOTAL_SECRETS"
log " tenant-shaped secrets: $TENANT_SECRETS"
log " managed-shaped secrets: $MANAGED_SECRETS"
log " would delete: $DELETE_COUNT"
log " would keep: $KEEP_COUNT"
log ""
@@ -272,17 +350,35 @@ for reason, n in keep_c.most_common():
print(f' keep/{reason}: {n}')
"
# Safety gate operates against the tenant-shaped subset — same
# Safety gate operates against the managed-shaped subset — same
# rationale as sweep-cf-tunnels: a miscount of platform-infra
# secrets shouldn't relax the gate.
if [ "$TENANT_SECRETS" -gt 0 ]; then
PCT=$(( DELETE_COUNT * 100 / TENANT_SECRETS ))
#
# IMPORTANT (the historical no-op trap): the REAL safety here is the
# per-secret live-ORG cross-reference above — a secret is only marked
# delete when its owning org provably no longer exists in the CP DB. The
# percent gate is a blunt second line that, on a large genuine backlog
# (e.g. the June-2026 2.4k-orphan workspace-config sprawl, ~99% orphan),
# would itself BLOCK the very cleanup it exists to allow. So:
# - Normal steady state: <50% delete → gate passes, runs every hour.
# - Genuine bulk backlog: set SWEEP_ALLOW_BULK=1 to bypass the percent
# gate DELIBERATELY, trusting the live-org cross-reference. This is the
# sanctioned way to drain a backlog — NOT a blind MAX_DELETE_PCT bump.
if [ "$MANAGED_SECRETS" -gt 0 ]; then
PCT=$(( DELETE_COUNT * 100 / MANAGED_SECRETS ))
if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
log ""
log "SAFETY: would delete $PCT% of tenant-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing."
log " If this is expected (e.g. major cleanup after incident), rerun with"
log " MAX_DELETE_PCT=$((PCT+5)) $0 $*"
exit 2
if [ "${SWEEP_ALLOW_BULK:-0}" = "1" ]; then
log ""
log "SAFETY: would delete $PCT% of managed-shaped secrets (>$MAX_DELETE_PCT%) — BULK override active (SWEEP_ALLOW_BULK=1)."
log " Proceeding: every delete is live-org-cross-referenced AND a 30-day RECOVERABLE delete (restorable)."
else
log ""
log "SAFETY: would delete $PCT% of managed-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing."
log " This is the expected gate on a genuine backlog. The deletes are"
log " live-org-cross-referenced and recoverable (30d). To drain a backlog"
log " deliberately, rerun with SWEEP_ALLOW_BULK=1 $0 $*"
exit 2
fi
fi
fi
@@ -312,11 +408,15 @@ fi
# 30 min cap, but parallel-by-default keeps us symmetric with the
# other sweepers and gives headroom for a one-off backlog.
#
# --force-delete-without-recovery skips the 7-30 day recovery window.
# Acceptable here because (a) the GRACE_HOURS filter prevents touching
# in-flight provisions, and (b) the secret is regenerated on every
# fresh provision — losing one only matters for a tenant we're
# explicitly trying to forget.
# Deletion is RECOVERABLE (30-day recovery window), NOT force-delete.
# Changed from --force-delete-without-recovery: a mistaken sweep must be
# reversible via `aws secretsmanager restore-secret` for 30 days. The
# GRACE_HOURS filter + live-org cross-reference make a mistake unlikely,
# but at this scale (thousands of secrets) an unrecoverable bulk delete is
# an unacceptable blast radius. The secret is also regenerated on every
# fresh provision, so the recovery window is belt-and-suspenders, not the
# only safety. Matches the CP provisioner + auto-reap reaper, which both
# use DeleteSecret + RecoveryWindowInDays=30.
CONCURRENCY="${SWEEP_CONCURRENCY:-8}"
DELETE_PLAN=$(mktemp -t aws-secrets-plan-XXXXXX)
@@ -353,7 +453,7 @@ xargs -P "$CONCURRENCY" -L 1 -I {} bash -c '
if aws secretsmanager delete-secret \
--region "$AWS_REGION" \
--secret-id "$arn" \
--force-delete-without-recovery \
--recovery-window-in-days 30 \
--output json >/dev/null 2>&1; then
echo OK
else
+124
View File
@@ -0,0 +1,124 @@
"""Tests for the sweep-aws-secrets.sh decision logic (#890).
Run locally: ``python3 -m unittest scripts/ops/test_sweep_aws_decide.py -v``
Why this exists: the inline Python heredoc in sweep-aws-secrets.sh decides
which AWS Secrets Manager secrets to delete. A misclassification could nuke
a LIVE tenant's bootstrap secret or a live workspace's config secret. These
tests pin the rule order for BOTH managed namespaces:
- molecule/tenant/<org_id>/bootstrap (org_id in the NAME)
- molecule/workspace/<ws_id>/config (owning org on the OrgID TAG)
To avoid drift, the test EXTRACTS the `decide`/`org_tag`/`parse_iso` helpers
straight out of the shell script's heredoc at runtime and execs them — so a
change to the shell logic that isn't reflected here (or vice-versa) fails.
"""
from __future__ import annotations
import os
import re
import unittest
from datetime import datetime, timedelta, timezone
SCRIPT = os.path.join(os.path.dirname(__file__), "sweep-aws-secrets.sh")
def _load_decide():
    """Extract the decision helpers from the shell heredoc and exec them.

    Returns the namespace dict produced by exec'ing the inline Python found in
    sweep-aws-secrets.sh, truncated just before the statement that reads
    stdin, so only the definitions above that point are evaluated.

    Raises AssertionError when the heredoc, or the stdin-reading statement
    that marks the end of the definitions, cannot be located.
    """
    # Context manager so the script's file handle is closed deterministically.
    with open(SCRIPT, encoding="utf-8") as fh:
        src = fh.read()
    # The heredoc body lives between `DECISIONS=$(echo "$SECRET_JSON" | python3 -c '`
    # and the closing `')`. Grab the python source between the single quotes.
    m = re.search(r"DECISIONS=\$\(echo \"\$SECRET_JSON\" \| python3 -c '(.*?)'\)", src, re.S)
    if not m:
        raise AssertionError("could not locate the decision heredoc in sweep-aws-secrets.sh")
    body = m.group(1)
    # We only want the definitions, so everything from the final stdin loop
    # onward is dropped. If the marker is gone, fail loudly: exec'ing the full
    # body would consume the test runner's stdin instead of defining helpers.
    cut = body.find("d = json.loads(sys.stdin.read())")
    if cut == -1:
        raise AssertionError(
            "could not locate the stdin loop in the decision heredoc of sweep-aws-secrets.sh")
    body = body[:cut]
    ns: dict = {}
    # The heredoc reads these from os.environ while it is being exec'd;
    # setdefault keeps any value the caller has already exported.
    os.environ.setdefault("PROD_IDS", "live-org-1")
    os.environ.setdefault("STAGING_IDS", "live-org-2")
    os.environ.setdefault("GRACE_HOURS", "24")
    exec(compile(body, "<heredoc>", "exec"), ns)
    return ns
NS = _load_decide()
ALL_IDS = {"live-org-1", "live-org-2"}
GRACE = timedelta(hours=24)
NOW = datetime.now(timezone.utc)
OLD = (NOW - timedelta(days=30)).isoformat()
FRESH = NOW.isoformat()
U = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
def decide(secret: dict):
    """Classify one secret with the shared fixtures; return (action, reason)."""
    outcome = NS["decide"](secret, ALL_IDS, GRACE, NOW)
    # The extracted helper yields a 4-tuple; the ARN and name are not needed.
    verdict, why, _unused_arn, _unused_name = outcome
    return (verdict, why)
class TestNonManaged(unittest.TestCase):
    """Names that match neither managed shape are always kept."""

    EXPECTED = ("keep", "not-a-managed-secret")

    def test_arbitrary_secret_kept(self):
        secret = {"Name": "molecule/random/x", "CreatedDate": OLD}
        self.assertEqual(decide(secret), self.EXPECTED)

    def test_bad_uuid_tenant_kept(self):
        secret = {"Name": "molecule/tenant/not-a-uuid/bootstrap", "CreatedDate": OLD}
        self.assertEqual(decide(secret), self.EXPECTED)
class TestTenant(unittest.TestCase):
    """Rule order for molecule/tenant/<org_id>/bootstrap secrets."""

    def test_orphan_tenant_deleted(self):
        """An old tenant secret whose org id is not in the live set is deleted."""
        self.assertEqual(decide({"Name": f"molecule/tenant/{U}/bootstrap", "CreatedDate": OLD}),
                         ("delete", "orphan-tenant"))

    def test_live_tenant_kept(self):
        """A name whose id segment is not UUID-shaped is kept as unmanaged.

        Despite the method name, the live-org lookup is never reached here:
        the name does not match the tenant pattern, so the expected reason is
        "not-a-managed-secret" rather than "live-tenant".
        """
        self.assertEqual(decide({"Name": "molecule/tenant/live-org-1xxxxxxxxxxxxxxxxxxxxxxxxxxx/bootstrap",
                                 "CreatedDate": OLD}),
                         ("keep", "not-a-managed-secret"))  # not UUID-shaped → not managed

    def test_live_tenant_real_uuid_kept(self):
        """A UUID-shaped tenant secret whose org id is in the live set is kept."""
        live = "11111111-2222-3333-4444-555555555555"
        # The live set is passed to decide() explicitly, so no process-global
        # environment needs to be touched for this case.
        action, reason, _, _ = NS["decide"](
            {"Name": f"molecule/tenant/{live}/bootstrap", "CreatedDate": OLD},
            ALL_IDS | {live}, GRACE, NOW)
        self.assertEqual((action, reason), ("keep", "live-tenant"))
class TestWorkspace(unittest.TestCase):
    """Rule order for molecule/workspace/<ws_id>/config secrets (org on the OrgID tag)."""

    @staticmethod
    def _old_workspace(tags):
        # Old (outside the grace window) workspace-config secret with the given tags.
        return {"Name": f"molecule/workspace/{U}/config", "CreatedDate": OLD, "Tags": tags}

    def test_orphan_workspace_deleted(self):
        secret = self._old_workspace([{"Key": "OrgID", "Value": "dead-org"}])
        self.assertEqual(decide(secret), ("delete", "orphan-workspace"))

    def test_live_org_workspace_kept(self):
        secret = self._old_workspace([{"Key": "OrgID", "Value": "live-org-2"}])
        self.assertEqual(decide(secret), ("keep", "workspace-live-org"))

    def test_no_org_tag_kept(self):
        secret = self._old_workspace([])
        self.assertEqual(decide(secret), ("keep", "workspace-no-org-tag"))
class TestGrace(unittest.TestCase):
    """A freshly created secret is kept even when its org is not live."""

    IN_GRACE = ("keep", "in-grace-window")

    def test_fresh_orphan_workspace_kept(self):
        secret = {
            "Name": f"molecule/workspace/{U}/config",
            "CreatedDate": FRESH,
            "Tags": [{"Key": "OrgID", "Value": "dead-org"}],
        }
        self.assertEqual(decide(secret), self.IN_GRACE)

    def test_fresh_orphan_tenant_kept(self):
        secret = {"Name": f"molecule/tenant/{U}/bootstrap", "CreatedDate": FRESH}
        self.assertEqual(decide(secret), self.IN_GRACE)
if __name__ == "__main__":
unittest.main()
+131
View File
@@ -0,0 +1,131 @@
#!/usr/bin/env bash
# Regression test for scripts/ops/sweep-aws-secrets.sh — verifies the
# live-org fetch is fail-closed. A non-2xx response, invalid JSON, or a
# response missing the 'orgs' array must abort the sweep BEFORE any secrets
# are classified as orphans. This is especially critical under
# SWEEP_ALLOW_BULK=1, where an empty live-org set would otherwise delete
# every old managed secret.
set -uo pipefail
SCRIPT="${SCRIPT:-scripts/ops/sweep-aws-secrets.sh}"
PASS=0
FAIL=0
#######################################
# Run the sweep script once against mocked `curl` and `aws` binaries and
# record whether it stopped (or proceeded) where the case expects.
# Globals:
#   SCRIPT (read) - path of the script under test
#   PASS, FAIL (written) - running counters, one of them is incremented
# Arguments:
#   $1 - case name, printed on pass and used as a prefix on failures
#   $2 - exit status the mock curl returns
#   $3 - body the mock curl prints
#   $4 - value for SWEEP_ALLOW_BULK (default 0)
#   $5 - "true" (default) if the sweep must abort before reaching aws,
#        anything else if it must reach the aws mock
# Outputs:
#   stdout: the case name on pass
#   stderr: failure reasons plus the captured stdout/stderr of the script
#######################################
run_case() {
  local name="$1" curl_exit="$2" curl_body="$3" bulk="${4:-0}"
  local expect_abort="${5:-true}" # true = must stop before AWS/orphan classification
  local tmp

  # Without the temp dir there are no mocks, and PATH would fall through to
  # the real curl/aws — count the case as failed instead of running it.
  tmp=$(mktemp -d -t sweep-fail-closed-XXXXXX) || {
    echo "$name: could not create temp dir" >&2
    FAIL=$((FAIL + 1))
    return
  }
  local sentinel="$tmp/aws_reached"

  # Generate the mock curl with Python so JSON quotes/brackets in the body
  # are shell-quoted by shlex rather than mangled by hand-written quoting.
  # The inputs are handed over as command-scoped env vars, so nothing is
  # exported into the environment of the script under test.
  if ! MOCK_CURL_BODY="$curl_body" MOCK_CURL_EXIT="$curl_exit" MOCK_DIR="$tmp" python3 -c '
import os
import shlex

lines = [
    "#!/usr/bin/env bash",
    "echo " + shlex.quote(os.environ["MOCK_CURL_BODY"]),
    "exit " + os.environ["MOCK_CURL_EXIT"],
]
with open(os.path.join(os.environ["MOCK_DIR"], "curl"), "w") as f:
    f.write("\n".join(lines) + "\n")
'; then
    echo "$name: could not write mock curl" >&2
    rm -rf -- "$tmp"
    FAIL=$((FAIL + 1))
    return
  fi
  chmod +x "$tmp/curl"

  # Mock aws cli: writes a sentinel file and exits with a distinctive code
  # so we can prove whether the sweep reached AWS/classification.
  printf '%s\n' \
    '#!/usr/bin/env bash' \
    'echo "reached" > "$AWS_SENTINEL"' \
    'exit 99' > "$tmp/aws"
  chmod +x "$tmp/aws"

  local out="$tmp/out" err="$tmp/err"
  PATH="$tmp:$PATH" \
    CP_ADMIN_API_TOKEN=tok-prod \
    CP_STAGING_ADMIN_API_TOKEN=tok-staging \
    AWS_ACCESS_KEY_ID=ak \
    AWS_SECRET_ACCESS_KEY=sk \
    AWS_SENTINEL="$sentinel" \
    SWEEP_ALLOW_BULK="$bulk" \
    bash "$SCRIPT" --execute > "$out" 2> "$err"
  local actual_exit=$?

  local case_fail=0
  if [ "$expect_abort" = "true" ]; then
    # Fail-closed cases: script must abort before AWS and before orphan
    # classification. Exit code should be the fetch/validation failure,
    # NOT the aws mock's distinctive 99 and NOT success.
    case "$actual_exit" in
      99)
        echo "$name: reached aws mock (exit 99) instead of aborting at fetch" >&2
        case_fail=1
        ;;
      0)
        echo "$name: exited 0 instead of aborting" >&2
        case_fail=1
        ;;
    esac
    if [ -f "$sentinel" ]; then
      echo "$name: aws sentinel exists — sweep reached AWS/classification" >&2
      case_fail=1
    fi
    if grep -qE '== Sweep plan ==|would delete:|orphan-(tenant|workspace)' "$out" "$err" 2>/dev/null; then
      echo "$name: output contains sweep plan / orphan classification" >&2
      case_fail=1
    fi
  else
    # Control cases: a valid live-org fetch must let the sweep proceed past
    # the fetch; the aws mock's sentinel and exit code prove it got there.
    if [ ! -f "$sentinel" ]; then
      echo "$name: aws sentinel missing — sweep did not reach AWS/classification" >&2
      case_fail=1
    fi
    if [ "$actual_exit" -ne 99 ]; then
      echo "$name: expected aws mock exit 99, got $actual_exit" >&2
      case_fail=1
    fi
  fi

  if [ "$case_fail" -eq 0 ]; then
    echo "$name"
    PASS=$((PASS + 1))
  else
    echo " stdout:" >&2
    sed 's/^/ /' "$out" >&2
    echo " stderr:" >&2
    sed 's/^/ /' "$err" >&2
    FAIL=$((FAIL + 1))
  fi
  rm -rf -- "$tmp"
}
echo "Test: sweep-aws-secrets live-org fetch fail-closed"
echo
# Non-2xx from CP admin API (curl -f exits 22).
run_case "prod API returns 500" 22 '{"error":"internal"}' 0 true
run_case "prod API returns 500 with SWEEP_ALLOW_BULK=1" 22 '{"error":"internal"}' 1 true
# Valid HTTP but invalid JSON body.
run_case "prod API returns malformed JSON" 0 'this is not json' 0 true
run_case "prod API returns malformed JSON with SWEEP_ALLOW_BULK=1" 0 'this is not json' 1 true
# Valid JSON but missing 'orgs' key.
run_case "prod API returns JSON without orgs" 0 '{"foo":"bar"}' 0 true
run_case "prod API returns JSON without orgs with SWEEP_ALLOW_BULK=1" 0 '{"foo":"bar"}' 1 true
# Valid JSON but 'orgs' is not an array.
run_case "prod API returns orgs as string" 0 '{"orgs":"not-an-array"}' 0 true
# Happy-path control: valid orgs array must allow the sweep to proceed past
# the live-org fetch and reach AWS/classification. We use an empty orgs list
# so no real delete work happens; the aws mock proves the boundary was crossed.
run_case "prod API returns valid empty orgs (reaches AWS)" 0 '{"orgs":[]}' 0 false
echo
echo "passed=$PASS failed=$FAIL"
[ "$FAIL" -eq 0 ]