fix(sweep-aws-secrets): sweep molecule/workspace/*config too + recoverable delete + bulk gate override (#890) #3134
@@ -19,24 +19,36 @@
|
||||
# add KindSecretsManagerSecret + recorder hook + reconciler enumerator.
|
||||
# Tracked separately as a controlplane issue.
|
||||
#
|
||||
# This is a parallel-shape janitor to sweep-cf-tunnels.sh:
|
||||
# Sweeps TWO managed namespaces (both reduce to "owning org is gone"):
|
||||
# - molecule/tenant/<org_id>/bootstrap — org_id is in the NAME.
|
||||
# - molecule/workspace/<ws_id>/config — owning org is on the OrgID TAG
|
||||
# (cp#329 per-workspace config delivery). THIS prefix was the entire
|
||||
# ~$253/mo SM bill in June 2026 (2.4k orphan secrets from purged
|
||||
# ephemeral E2E orgs) and was NOT swept before — the old filter only
|
||||
# matched molecule/tenant/, so the janitor reported SUCCESS while
|
||||
# deleting nothing relevant. Per-workspace liveness INSIDE a still-live
|
||||
# org is owned by the CP auto-reap secrets reaper; this sweeper only
|
||||
# deletes a workspace secret whose whole org is gone (no race risk).
|
||||
#
|
||||
# Steps:
|
||||
# 1. Query CP admin API to enumerate live org IDs (prod + staging)
|
||||
# 2. Enumerate AWS Secrets Manager secrets matching the tenant prefix
|
||||
# 3. For each secret matching `molecule/tenant/<org_id>/bootstrap`,
|
||||
# check if <org_id> appears in the live set
|
||||
# 4. Defense-in-depth: skip secrets created in the last 24h
|
||||
# (window for a provision-in-progress that hasn't yet finished
|
||||
# its first heartbeat to CP)
|
||||
# 5. Only delete secrets with NO live org counterpart AND outside
|
||||
# the 24h grace window
|
||||
# 2. Enumerate SM secrets matching either managed prefix
|
||||
# 3. tenant/* → org_id from name; workspace/* → org_id from OrgID tag
|
||||
# 4. Defense-in-depth: skip secrets created in the last GRACE_HOURS
|
||||
# 5. Only delete secrets whose owning org is NOT in the live set AND are
|
||||
# outside the grace window
|
||||
#
|
||||
# Dry-run by default; must pass --execute to actually delete.
|
||||
#
|
||||
# Note on deletion semantics: --force-delete-without-recovery skips
|
||||
# the 7-30 day recovery window. We accept this because (a) the grace
|
||||
# window above already filters in-flight provisions, and (b) the
|
||||
# bootstrap secret is regenerated on every reprovision — losing one
|
||||
# is recoverable by re-running the provision flow.
|
||||
# Deletion semantics: RECOVERABLE 30-day delete (NOT force-delete). A
|
||||
# mistaken sweep is reversible via `aws secretsmanager restore-secret`
|
||||
# for 30 days. At thousands-of-secrets scale an unrecoverable bulk delete
|
||||
# is an unacceptable blast radius; matches the CP provisioner + reaper.
|
||||
#
|
||||
# Bulk backlog: the MAX_DELETE_PCT gate will (correctly) block a genuine
|
||||
# >50%-orphan backlog. To drain one deliberately, set SWEEP_ALLOW_BULK=1
|
||||
# — the real safety is the live-org cross-reference + 30d recovery, not
|
||||
# the percent gate.
|
||||
#
|
||||
# Env vars required:
|
||||
# AWS_REGION — region the secrets live in (default: us-east-1)
|
||||
@@ -52,7 +64,8 @@
|
||||
# 0 — dry-run completed or sweep executed successfully
|
||||
# 1 — missing required env, API failure, or unexpected state
|
||||
# 2 — safety check failed (would delete >MAX_DELETE_PCT% of
|
||||
# tenant-shaped secrets; refusing)
|
||||
# managed-shaped secrets; refusing — set SWEEP_ALLOW_BULK=1 to
|
||||
# drain a deliberate backlog)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -106,16 +119,40 @@ log() { echo "[$(date -u +%H:%M:%S)] $*"; }
|
||||
# awsapi.TenantSecretName in molecule-controlplane. The /cp/admin/orgs
|
||||
# response includes both `id` and `slug`; we extract `id` here.
|
||||
|
||||
# Fetch org IDs from a CP admin API endpoint.
|
||||
# Fail-closed: any non-2xx HTTP response, invalid JSON, or missing/invalid
|
||||
# 'orgs' array aborts the sweep with a non-zero exit. This is critical under
|
||||
# SWEEP_ALLOW_BULK=1, where an empty live-org set would classify every old
|
||||
# managed secret as orphan.
|
||||
fetch_cp_orgs() {
  # Print the space-separated org ids returned by one CP admin API endpoint.
  #
  # Arguments:
  #   $1 - full URL of the /cp/admin/orgs endpoint
  #   $2 - bearer token for that endpoint
  #   $3 - short label ("prod", "staging") used only in diagnostics
  # Outputs:
  #   stdout - org ids joined by single spaces (empty line for an empty list)
  #   stderr - diagnostics
  # Returns:
  #   0 on success; non-zero when the request fails, the body is not JSON,
  #   "orgs" is missing or not an array, or an entry has no usable id.
  local url="$1" token="$2" label="$3"
  local resp

  # -f turns a non-2xx status into a non-zero exit; stderr is folded into
  # the capture so the failure text can be echoed back below.
  resp=$(curl -sS -f -m 15 -H "Authorization: Bearer $token" "$url" 2>&1) || {
    echo "ERROR: $label CP admin API request failed (non-2xx or network error)" >&2
    echo "$resp" >&2
    return 1
  }

  # The label and URL travel as argv, never spliced into the Python source,
  # so a quote character in either cannot alter the program.
  python3 -c '
import json, re, sys

label, url = sys.argv[1], sys.argv[2]


def fail(msg, *extra):
    print("ERROR: " + label + " CP admin API " + msg, *extra, file=sys.stderr)
    sys.exit(1)


try:
    d = json.loads(sys.stdin.read())
except json.JSONDecodeError as e:
    fail("returned invalid JSON:", e)

orgs = d.get("orgs") if isinstance(d, dict) else None
if not isinstance(orgs, list):
    fail("response missing or invalid \"orgs\" array")

ids = []
for o in orgs:
    oid = o.get("id") if isinstance(o, dict) else None
    parts = oid.split() if isinstance(oid, str) else []
    # Exactly one whitespace-free token is required: the ids are emitted
    # space-separated, so anything else would corrupt the live set.
    if len(parts) != 1:
        fail("response contains an org entry without a usable \"id\"")
    ids.append(parts[0])

# A page that fills the requested limit may be truncated, and a truncated
# live set makes live orgs look like orphans. Warn loudly, do not abort.
m = re.search(r"[?&]limit=(\d+)", url)
if m and int(m.group(1)) > 0 and len(ids) >= int(m.group(1)):
    print("::warning::" + label + " CP admin API returned " + str(len(ids))
          + " orgs, filling limit=" + m.group(1)
          + " - live-org set may be truncated", file=sys.stderr)

print(" ".join(ids))
' "$label" "$url" <<< "$resp"
}
|
||||
|
||||
log "Fetching CP prod org ids..."
|
||||
PROD_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
|
||||
"https://api.moleculesai.app/cp/admin/orgs?limit=500" \
|
||||
| python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))")
|
||||
PROD_IDS=$(fetch_cp_orgs "https://api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_ADMIN_API_TOKEN" "prod")
|
||||
log " prod orgs: $(echo "$PROD_IDS" | wc -w | tr -d ' ')"
|
||||
|
||||
log "Fetching CP staging org ids..."
|
||||
STAGING_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
|
||||
"https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
|
||||
| python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))")
|
||||
STAGING_IDS=$(fetch_cp_orgs "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" "$CP_STAGING_ADMIN_API_TOKEN" "staging")
|
||||
log " staging orgs: $(echo "$STAGING_IDS" | wc -w | tr -d ' ')"
|
||||
|
||||
log "Fetching AWS Secrets Manager secrets (region=$AWS_REGION)..."
|
||||
@@ -143,16 +180,22 @@ NEXT_TOKEN=""
|
||||
PAGE=1
|
||||
while :; do
|
||||
page_file="$PAGES_DIR/page-$(printf '%05d' "$PAGE").json"
|
||||
# Sweep BOTH managed prefixes: molecule/tenant/* (per-org bootstrap) AND
|
||||
# molecule/workspace/* (per-workspace config, cp#329). The latter was the
|
||||
# entire ~$253/mo SM bill in June 2026 (2.4k orphan secrets) and this
|
||||
# filter never matched it before — the sweeper reported SUCCESS while
|
||||
# deleting nothing relevant. A name-filter Values list is OR-matched by
|
||||
# Secrets Manager, so this captures both namespaces in one paginated walk.
|
||||
if [ -z "$NEXT_TOKEN" ]; then
|
||||
aws secretsmanager list-secrets \
|
||||
--region "$AWS_REGION" \
|
||||
--filters Key=name,Values=molecule/tenant/ \
|
||||
--filters Key=name,Values=molecule/tenant/,molecule/workspace/ \
|
||||
--max-results 100 \
|
||||
--output json > "$page_file"
|
||||
else
|
||||
aws secretsmanager list-secrets \
|
||||
--region "$AWS_REGION" \
|
||||
--filters Key=name,Values=molecule/tenant/ \
|
||||
--filters Key=name,Values=molecule/tenant/,molecule/workspace/ \
|
||||
--max-results 100 \
|
||||
--next-token "$NEXT_TOKEN" \
|
||||
--output json > "$page_file"
|
||||
@@ -160,8 +203,8 @@ while :; do
|
||||
NEXT_TOKEN=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(d.get('NextToken') or '')" "$page_file")
|
||||
PAGE=$((PAGE + 1))
|
||||
if [ -z "$NEXT_TOKEN" ]; then break; fi
|
||||
if [ "$PAGE" -gt 50 ]; then
|
||||
log "::warning::stopping pagination at page 50 (5000 secrets) — re-run if more"
|
||||
if [ "$PAGE" -gt 100 ]; then
|
||||
log "::warning::stopping pagination at page 100 (10000 secrets) — re-run if more"
|
||||
break
|
||||
fi
|
||||
done
|
||||
@@ -179,14 +222,28 @@ log " total tenant-prefixed secrets: $TOTAL_SECRETS"
|
||||
|
||||
# --- Compute orphans -------------------------------------------------------
|
||||
#
|
||||
# Rules (in order):
|
||||
# 1. Name doesn't match `molecule/tenant/<org_id>/bootstrap` → keep
|
||||
# (unknown — never sweep arbitrary secrets that might belong to
|
||||
# platform infra or other tenants of this AWS account).
|
||||
# 2. CreatedDate within $GRACE_HOURS → keep (defense-in-depth: don't
|
||||
# kill a secret while its provision is still mid-flight).
|
||||
# 3. org_id ∈ {prod_ids ∪ staging_ids} → keep (live tenant).
|
||||
# 4. Otherwise → delete (orphan).
|
||||
# Two managed namespaces, cross-referenced against the SAME live ORG set
|
||||
# (prod_ids ∪ staging_ids fetched from the CP admin API). Both reduce to
|
||||
# "the owning org no longer exists ⇒ orphan", which is the safe, org-level
|
||||
# signal the sweeper can establish without per-workspace liveness:
|
||||
#
|
||||
# molecule/tenant/<org_id>/bootstrap — org_id is in the NAME.
|
||||
# molecule/workspace/<ws_id>/config — ws_id is in the name (NOT an org
|
||||
# id); the owning org is on the secret's OrgID TAG (set by the CP
|
||||
# provisioner's seedWorkspaceConfigSecret). A workspace secret whose
|
||||
# OrgID tag is not a live org is a guaranteed orphan: the whole tenant
|
||||
# is gone, so the workspace can't exist. (Per-workspace liveness inside
|
||||
# a STILL-LIVE org is owned by the CP auto-reap secrets reaper, which
|
||||
# calls the tenant /workspaces endpoint — this sweeper deliberately
|
||||
# does NOT delete a workspace secret whose org is still live, to avoid
|
||||
# racing a live tenant's in-flight workspace.)
|
||||
#
|
||||
# Rules (in order, per secret):
|
||||
# 1. Name matches neither managed shape → keep (never sweep arbitrary
|
||||
# secrets that might belong to platform infra).
|
||||
# 2. CreatedDate within $GRACE_HOURS → keep (provision-in-flight margin).
|
||||
# 3. owning org ∈ {prod_ids ∪ staging_ids} → keep (live tenant).
|
||||
# 4. Otherwise → delete (orphan) via 30-day RECOVERABLE delete.
|
||||
|
||||
export PROD_IDS STAGING_IDS GRACE_HOURS
|
||||
DECISIONS=$(echo "$SECRET_JSON" | python3 -c '
|
||||
@@ -201,6 +258,8 @@ now = datetime.now(timezone.utc)
|
||||
|
||||
# molecule/tenant/<org_id>/bootstrap — org_id is a UUID.
|
||||
_TENANT_RE = re.compile(r"^molecule/tenant/([0-9a-fA-F-]{36})/bootstrap$")
|
||||
# molecule/workspace/<ws_id>/config — ws_id is a UUID; owning org is on the tag.
|
||||
_WS_RE = re.compile(r"^molecule/workspace/([0-9a-fA-F-]{36})/config$")
|
||||
|
||||
def parse_iso(s):
|
||||
if not s:
|
||||
@@ -212,24 +271,43 @@ def parse_iso(s):
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
def org_tag(s):
    # Return the value of the first OrgID tag on a list-secrets entry, or ""
    # when the entry has no Tags, no OrgID tag, or an empty/non-string value.
    #
    # The caller compares the result verbatim against the live-org set, so
    # surrounding whitespace is stripped here: a padded tag value would
    # otherwise never match and a live org would look like an orphan.
    for t in s.get("Tags") or []:
        if not isinstance(t, dict):
            # Tolerate malformed tag entries instead of raising mid-sweep.
            continue
        if t.get("Key") == "OrgID":
            value = t.get("Value")
            return value.strip() if isinstance(value, str) else ""
    return ""
|
||||
|
||||
def _org_is_live(org_id, all_ids):
    # True when org_id is in the live-org collection, ignoring letter case.
    # _TENANT_RE accepts both upper- and lower-case hex, so an id spelled in
    # a different case than the live set must still count as live; an
    # exact-match-only test would classify a live org as orphan.
    return (
        org_id in all_ids
        or org_id.lower() in all_ids
        or org_id.upper() in all_ids
    )


def decide(s, all_ids, grace, now):
    # Classify one list-secrets entry as keep or delete.
    #
    # Args:
    #   s       - dict for one secret; reads Name, ARN, CreatedDate,
    #             LastChangedDate and (via org_tag) Tags.
    #   all_ids - collection of live org ids (prod and staging combined).
    #   grace   - timedelta; secrets younger than this are never deleted.
    #   now     - current time, compared against the parsed creation date.
    #
    # Returns a 4-tuple (action, reason, arn, name) where action is "keep"
    # or "delete". Rules are applied in order: unmanaged name, grace window,
    # live owning org, otherwise orphan.
    name = s.get("Name", "")
    arn = s.get("ARN", "")

    mt = _TENANT_RE.match(name)
    mw = _WS_RE.match(name)
    if not mt and not mw:
        return ("keep", "not-a-managed-secret", arn, name)

    # Grace gate (both shapes): never touch a secret younger than the window.
    # NOTE(review): when neither date parses, created is None and this gate
    # is skipped - confirm parse_iso handles the date format the aws CLI emits.
    created = parse_iso(s.get("CreatedDate") or s.get("LastChangedDate"))
    if created is not None and (now - created) < grace:
        return ("keep", "in-grace-window", arn, name)

    if mt:
        # tenant bootstrap: the owning org id is the UUID in the name.
        if _org_is_live(mt.group(1), all_ids):
            return ("keep", "live-tenant", arn, name)
        return ("delete", "orphan-tenant", arn, name)

    # workspace-config: owning org is on the OrgID tag.
    org_id = (org_tag(s) or "").strip()
    if not org_id:
        # No OrgID tag (legacy / hand-created) — cannot establish ownership;
        # keep and let the CP reaper (which parses the live set) handle it.
        return ("keep", "workspace-no-org-tag", arn, name)
    if _org_is_live(org_id, all_ids):
        # Org still live — defer to the CP auto-reap secrets reaper for
        # per-workspace liveness; do not race a live tenant here.
        return ("keep", "workspace-live-org", arn, name)
    return ("delete", "orphan-workspace", arn, name)
|
||||
|
||||
d = json.loads(sys.stdin.read())
|
||||
for s in d.get("SecretList", []):
|
||||
@@ -241,16 +319,16 @@ for s in d.get("SecretList", []):
|
||||
|
||||
DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
|
||||
KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
|
||||
TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
|
||||
MANAGED_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
|
||||
import json, sys
|
||||
n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
|
||||
n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-managed-secret')
|
||||
print(n)
|
||||
")
|
||||
|
||||
log ""
|
||||
log "== Sweep plan =="
|
||||
log " total secrets: $TOTAL_SECRETS"
|
||||
log " tenant-shaped secrets: $TENANT_SECRETS"
|
||||
log " managed-shaped secrets: $MANAGED_SECRETS"
|
||||
log " would delete: $DELETE_COUNT"
|
||||
log " would keep: $KEEP_COUNT"
|
||||
log ""
|
||||
@@ -272,17 +350,35 @@ for reason, n in keep_c.most_common():
|
||||
print(f' keep/{reason}: {n}')
|
||||
"
|
||||
|
||||
# Safety gate operates against the tenant-shaped subset — same
|
||||
# Safety gate operates against the managed-shaped subset — same
|
||||
# rationale as sweep-cf-tunnels: a miscount of platform-infra
|
||||
# secrets shouldn't relax the gate.
|
||||
if [ "$TENANT_SECRETS" -gt 0 ]; then
|
||||
PCT=$(( DELETE_COUNT * 100 / TENANT_SECRETS ))
|
||||
#
|
||||
# IMPORTANT (the historical no-op trap): the REAL safety here is the
|
||||
# per-secret live-ORG cross-reference above — a secret is only marked
|
||||
# delete when its owning org provably no longer exists in the CP DB. The
|
||||
# percent gate is a blunt second line that, on a large genuine backlog
|
||||
# (e.g. the June-2026 2.4k-orphan workspace-config sprawl, ~99% orphan),
|
||||
# would itself BLOCK the very cleanup it exists to allow. So:
|
||||
# - Normal steady state: <50% delete → gate passes, runs every hour.
|
||||
# - Genuine bulk backlog: set SWEEP_ALLOW_BULK=1 to bypass the percent
|
||||
# gate DELIBERATELY, trusting the live-org cross-reference. This is the
|
||||
# sanctioned way to drain a backlog — NOT a blind MAX_DELETE_PCT bump.
|
||||
if [ "$MANAGED_SECRETS" -gt 0 ]; then
|
||||
PCT=$(( DELETE_COUNT * 100 / MANAGED_SECRETS ))
|
||||
if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
|
||||
log ""
|
||||
log "SAFETY: would delete $PCT% of tenant-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing."
|
||||
log " If this is expected (e.g. major cleanup after incident), rerun with"
|
||||
log " MAX_DELETE_PCT=$((PCT+5)) $0 $*"
|
||||
exit 2
|
||||
if [ "${SWEEP_ALLOW_BULK:-0}" = "1" ]; then
|
||||
log ""
|
||||
log "SAFETY: would delete $PCT% of managed-shaped secrets (>$MAX_DELETE_PCT%) — BULK override active (SWEEP_ALLOW_BULK=1)."
|
||||
log " Proceeding: every delete is live-org-cross-referenced AND a 30-day RECOVERABLE delete (restorable)."
|
||||
else
|
||||
log ""
|
||||
log "SAFETY: would delete $PCT% of managed-shaped secrets (threshold $MAX_DELETE_PCT%) — refusing."
|
||||
log " This is the expected gate on a genuine backlog. The deletes are"
|
||||
log " live-org-cross-referenced and recoverable (30d). To drain a backlog"
|
||||
log " deliberately, rerun with SWEEP_ALLOW_BULK=1 $0 $*"
|
||||
exit 2
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -312,11 +408,15 @@ fi
|
||||
# 30 min cap, but parallel-by-default keeps us symmetric with the
|
||||
# other sweepers and gives headroom for a one-off backlog.
|
||||
#
|
||||
# --force-delete-without-recovery skips the 7-30 day recovery window.
|
||||
# Acceptable here because (a) the GRACE_HOURS filter prevents touching
|
||||
# in-flight provisions, and (b) the secret is regenerated on every
|
||||
# fresh provision — losing one only matters for a tenant we're
|
||||
# explicitly trying to forget.
|
||||
# Deletion is RECOVERABLE (30-day recovery window), NOT force-delete.
|
||||
# Changed from --force-delete-without-recovery: a mistaken sweep must be
|
||||
# reversible via `aws secretsmanager restore-secret` for 30 days. The
|
||||
# GRACE_HOURS filter + live-org cross-reference make a mistake unlikely,
|
||||
# but at this scale (thousands of secrets) an unrecoverable bulk delete is
|
||||
# an unacceptable blast radius. The secret is also regenerated on every
|
||||
# fresh provision, so the recovery window is belt-and-suspenders, not the
|
||||
# only safety. Matches the CP provisioner + auto-reap reaper, which both
|
||||
# use DeleteSecret + RecoveryWindowInDays=30.
|
||||
|
||||
CONCURRENCY="${SWEEP_CONCURRENCY:-8}"
|
||||
DELETE_PLAN=$(mktemp -t aws-secrets-plan-XXXXXX)
|
||||
@@ -353,7 +453,7 @@ xargs -P "$CONCURRENCY" -L 1 -I {} bash -c '
|
||||
if aws secretsmanager delete-secret \
|
||||
--region "$AWS_REGION" \
|
||||
--secret-id "$arn" \
|
||||
--force-delete-without-recovery \
|
||||
--recovery-window-in-days 30 \
|
||||
--output json >/dev/null 2>&1; then
|
||||
echo OK
|
||||
else
|
||||
|
||||
@@ -0,0 +1,124 @@
|
||||
"""Tests for the sweep-aws-secrets.sh decision logic (#890).
|
||||
|
||||
Run locally: ``python3 -m unittest scripts/ops/test_sweep_aws_decide.py -v``
|
||||
|
||||
Why this exists: the inline Python heredoc in sweep-aws-secrets.sh decides
|
||||
which AWS Secrets Manager secrets to delete. A misclassification could nuke
|
||||
a LIVE tenant's bootstrap secret or a live workspace's config secret. These
|
||||
tests pin the rule order for BOTH managed namespaces:
|
||||
|
||||
- molecule/tenant/<org_id>/bootstrap (org_id in the NAME)
|
||||
- molecule/workspace/<ws_id>/config (owning org on the OrgID TAG)
|
||||
|
||||
To avoid drift, the test EXTRACTS the `decide`/`org_tag`/`parse_iso` helpers
|
||||
straight out of the shell script's heredoc at runtime and execs them — so a
|
||||
change to the shell logic that isn't reflected here (or vice-versa) fails.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import unittest
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
SCRIPT = os.path.join(os.path.dirname(__file__), "sweep-aws-secrets.sh")
|
||||
|
||||
|
||||
def _load_decide():
    """Extract the decision helpers from the shell heredoc and exec them.

    Reads SCRIPT, locates the Python source passed to ``python3 -c`` in the
    ``DECISIONS=$(...)`` assignment, drops everything from the stdin-reading
    driver onward, and executes the remaining definitions.

    Returns:
        dict: the namespace produced by the exec (contains ``decide``,
        ``org_tag``, ``parse_iso`` and whatever else the heredoc defines).

    Raises:
        AssertionError: if the heredoc or its stdin driver cannot be located.
    """
    with open(SCRIPT, encoding="utf-8") as fh:
        src = fh.read()
    # The heredoc body lives between `DECISIONS=$(echo "$SECRET_JSON" | python3 -c '`
    # and the closing `')`. Grab the python source between the single quotes.
    m = re.search(r"DECISIONS=\$\(echo \"\$SECRET_JSON\" \| python3 -c '(.*?)'\)", src, re.S)
    if not m:
        raise AssertionError("could not locate the decision heredoc in sweep-aws-secrets.sh")
    body = m.group(1)
    # The heredoc reads stdin at the bottom. We only want the definitions
    # above that point; if the marker is gone, exec-ing the full body would
    # block on sys.stdin.read() at import time, so fail loudly instead.
    cut = body.find("d = json.loads(sys.stdin.read())")
    if cut == -1:
        raise AssertionError("could not locate the stdin driver in the decision heredoc")
    body = body[:cut]
    ns: dict = {}
    # The heredoc reads these from os.environ at module level; seed defaults
    # without overriding anything the caller already exported.
    os.environ.setdefault("PROD_IDS", "live-org-1")
    os.environ.setdefault("STAGING_IDS", "live-org-2")
    os.environ.setdefault("GRACE_HOURS", "24")
    exec(compile(body, "<heredoc>", "exec"), ns)
    return ns
|
||||
|
||||
|
||||
NS = _load_decide()
|
||||
ALL_IDS = {"live-org-1", "live-org-2"}
|
||||
GRACE = timedelta(hours=24)
|
||||
NOW = datetime.now(timezone.utc)
|
||||
OLD = (NOW - timedelta(days=30)).isoformat()
|
||||
FRESH = NOW.isoformat()
|
||||
U = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
|
||||
|
||||
|
||||
def decide(secret: dict):
    """Run the extracted ``decide`` with the module fixtures; return (action, reason)."""
    outcome = NS["decide"](secret, ALL_IDS, GRACE, NOW)
    action, reason, _arn, _name = outcome
    return (action, reason)
|
||||
|
||||
|
||||
class TestNonManaged(unittest.TestCase):
    """Names that match neither managed shape are kept."""

    EXPECTED = ("keep", "not-a-managed-secret")

    def test_arbitrary_secret_kept(self):
        secret = {"Name": "molecule/random/x", "CreatedDate": OLD}
        self.assertEqual(decide(secret), self.EXPECTED)

    def test_bad_uuid_tenant_kept(self):
        secret = {"Name": "molecule/tenant/not-a-uuid/bootstrap", "CreatedDate": OLD}
        self.assertEqual(decide(secret), self.EXPECTED)
|
||||
|
||||
|
||||
class TestTenant(unittest.TestCase):
    """Rule order for molecule/tenant/<org_id>/bootstrap secrets."""

    def test_orphan_tenant_deleted(self):
        self.assertEqual(decide({"Name": f"molecule/tenant/{U}/bootstrap", "CreatedDate": OLD}),
                         ("delete", "orphan-tenant"))

    def test_non_hex_tenant_id_not_managed(self):
        # The id segment contains non-hex characters, so the tenant pattern
        # rejects the name and the secret is treated as unmanaged (kept).
        self.assertEqual(decide({"Name": "molecule/tenant/live-org-1xxxxxxxxxxxxxxxxxxxxxxxxxxx/bootstrap",
                                 "CreatedDate": OLD}),
                         ("keep", "not-a-managed-secret"))

    def test_live_tenant_real_uuid_kept(self):
        live = "11111111-2222-3333-4444-555555555555"
        # The live set is passed explicitly; no process-environment mutation
        # is needed (and none should leak into other tests).
        action, reason, _, _ = NS["decide"](
            {"Name": f"molecule/tenant/{live}/bootstrap", "CreatedDate": OLD},
            ALL_IDS | {live}, GRACE, NOW)
        self.assertEqual((action, reason), ("keep", "live-tenant"))
|
||||
|
||||
|
||||
class TestWorkspace(unittest.TestCase):
    """Rule order for molecule/workspace/<ws_id>/config secrets."""

    @staticmethod
    def _old_workspace(tags):
        # An out-of-grace workspace-config secret carrying the given Tags list.
        return {"Name": f"molecule/workspace/{U}/config", "CreatedDate": OLD, "Tags": tags}

    def test_orphan_workspace_deleted(self):
        secret = self._old_workspace([{"Key": "OrgID", "Value": "dead-org"}])
        self.assertEqual(decide(secret), ("delete", "orphan-workspace"))

    def test_live_org_workspace_kept(self):
        secret = self._old_workspace([{"Key": "OrgID", "Value": "live-org-2"}])
        self.assertEqual(decide(secret), ("keep", "workspace-live-org"))

    def test_no_org_tag_kept(self):
        secret = self._old_workspace([])
        self.assertEqual(decide(secret), ("keep", "workspace-no-org-tag"))
|
||||
|
||||
|
||||
class TestGrace(unittest.TestCase):
    """A just-created secret is kept even when its owning org is not live."""

    IN_GRACE = ("keep", "in-grace-window")

    def test_fresh_orphan_workspace_kept(self):
        secret = {
            "Name": f"molecule/workspace/{U}/config",
            "CreatedDate": FRESH,
            "Tags": [{"Key": "OrgID", "Value": "dead-org"}],
        }
        self.assertEqual(decide(secret), self.IN_GRACE)

    def test_fresh_orphan_tenant_kept(self):
        secret = {"Name": f"molecule/tenant/{U}/bootstrap", "CreatedDate": FRESH}
        self.assertEqual(decide(secret), self.IN_GRACE)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
+131
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env bash
|
||||
# Regression test for scripts/ops/sweep-aws-secrets.sh — verifies the
|
||||
# live-org fetch is fail-closed. A non-2xx response, invalid JSON, or a
|
||||
# response missing the 'orgs' array must abort the sweep BEFORE any secrets
|
||||
# are classified as orphans. This is especially critical under
|
||||
# SWEEP_ALLOW_BULK=1, where an empty live-org set would otherwise delete
|
||||
# every old managed secret.
|
||||
set -uo pipefail
|
||||
|
||||
SCRIPT="${SCRIPT:-scripts/ops/sweep-aws-secrets.sh}"
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
#######################################
# Run the sweep script once against mocked curl/aws binaries and record
# whether it behaved as expected.
# Globals:   SCRIPT (read), PASS / FAIL (incremented)
# Arguments: $1 - case name shown in the report
#            $2 - exit status the mock curl returns
#            $3 - body the mock curl prints
#            $4 - value for SWEEP_ALLOW_BULK (default 0)
#            $5 - "true" when the sweep must abort before reaching aws,
#                 anything else when it must reach the aws mock (default true)
# Outputs:   one ✓/✗ line per case; captured stdout/stderr on failure
#######################################
run_case() {
  local name="$1" curl_exit="$2" curl_body="$3" bulk="${4:-0}"
  local expect_abort="${5:-true}"  # true = must stop before AWS/orphan classification
  local tmp
  # Without a scratch dir the mocks would be written under "/"; bail out.
  tmp=$(mktemp -d -t sweep-fail-closed-XXXXXX) || {
    echo " ✗ $name: mktemp failed" >&2
    FAIL=$((FAIL + 1))
    return
  }
  local sentinel="$tmp/aws_reached"

  # Generate a mock curl script. We use Python to write the body so that
  # JSON quotes/brackets are not mangled by shell quoting in a heredoc.
  # The inputs are handed to python only (not exported), so they do not
  # leak into the environment of the script under test.
  curl_body="$curl_body" curl_exit="$curl_exit" tmp="$tmp" python3 -c "
import os, shlex
body = os.environ['curl_body']
exit_code = os.environ['curl_exit']
path = os.path.join(os.environ['tmp'], 'curl')
with open(path, 'w') as f:
    f.write('#!/usr/bin/env bash\n')
    f.write(f'echo {shlex.quote(body)}\n')
    f.write(f'exit {exit_code}\n')
"
  chmod +x "$tmp/curl"

  # Mock aws cli: writes a sentinel file and exits with a distinctive code
  # so we can prove whether the sweep reached AWS/classification.
  cat > "$tmp/aws" <<'MOCK'
#!/usr/bin/env bash
echo "reached" > "$AWS_SENTINEL"
exit 99
MOCK
  chmod +x "$tmp/aws"

  local out="$tmp/out" err="$tmp/err"
  local actual_exit=0
  PATH="$tmp:$PATH" \
  CP_ADMIN_API_TOKEN=tok-prod \
  CP_STAGING_ADMIN_API_TOKEN=tok-staging \
  AWS_ACCESS_KEY_ID=ak \
  AWS_SECRET_ACCESS_KEY=sk \
  AWS_SENTINEL="$sentinel" \
  SWEEP_ALLOW_BULK="$bulk" \
    bash "$SCRIPT" --execute > "$out" 2> "$err" || actual_exit=$?
  local case_fail=0

  if [ "$expect_abort" = "true" ]; then
    # Fail-closed cases: script must abort before AWS and before orphan
    # classification. Exit code should be the fetch/validation failure (1),
    # NOT the aws mock's distinctive 99.
    if [ "$actual_exit" -eq 99 ]; then
      echo " ✗ $name: reached aws mock (exit 99) instead of aborting at fetch" >&2
      case_fail=1
    elif [ "$actual_exit" -eq 0 ]; then
      echo " ✗ $name: exited 0 instead of aborting" >&2
      case_fail=1
    fi
    if [ -f "$sentinel" ]; then
      echo " ✗ $name: aws sentinel exists — sweep reached AWS/classification" >&2
      case_fail=1
    fi
    if grep -qE '== Sweep plan ==|would delete:|orphan-(tenant|workspace)' "$out" "$err" 2>/dev/null; then
      echo " ✗ $name: output contains sweep plan / orphan classification" >&2
      case_fail=1
    fi
  else
    # Happy-path control: a valid live-org fetch must let the sweep proceed
    # to the aws mock, which proves the fetch boundary was crossed.
    if [ ! -f "$sentinel" ]; then
      echo " ✗ $name: aws sentinel missing — sweep did not reach AWS/classification" >&2
      case_fail=1
    fi
    if [ "$actual_exit" -ne 99 ]; then
      echo " ✗ $name: expected aws mock exit 99, got $actual_exit" >&2
      case_fail=1
    fi
  fi

  if [ "$case_fail" -eq 0 ]; then
    echo " ✓ $name"
    PASS=$((PASS + 1))
  else
    echo " stdout:" >&2
    sed 's/^/ /' "$out" >&2
    echo " stderr:" >&2
    sed 's/^/ /' "$err" >&2
    FAIL=$((FAIL + 1))
  fi

  # ":?" aborts rather than expanding to an empty path.
  rm -rf -- "${tmp:?}"
}
|
||||
|
||||
echo "Test: sweep-aws-secrets live-org fetch fail-closed"
|
||||
echo
|
||||
|
||||
# Non-2xx from CP admin API (curl -f exits 22).
|
||||
run_case "prod API returns 500" 22 '{"error":"internal"}' 0 true
|
||||
run_case "prod API returns 500 with SWEEP_ALLOW_BULK=1" 22 '{"error":"internal"}' 1 true
|
||||
|
||||
# Valid HTTP but invalid JSON body.
|
||||
run_case "prod API returns malformed JSON" 0 'this is not json' 0 true
|
||||
run_case "prod API returns malformed JSON with SWEEP_ALLOW_BULK=1" 0 'this is not json' 1 true
|
||||
|
||||
# Valid JSON but missing 'orgs' key.
|
||||
run_case "prod API returns JSON without orgs" 0 '{"foo":"bar"}' 0 true
|
||||
run_case "prod API returns JSON without orgs with SWEEP_ALLOW_BULK=1" 0 '{"foo":"bar"}' 1 true
|
||||
|
||||
# Valid JSON but 'orgs' is not an array.
|
||||
run_case "prod API returns orgs as string" 0 '{"orgs":"not-an-array"}' 0 true
|
||||
|
||||
# Happy-path control: valid orgs array must allow the sweep to proceed past
|
||||
# the live-org fetch and reach AWS/classification. We use an empty orgs list
|
||||
# so no real delete work happens; the aws mock proves the boundary was crossed.
|
||||
run_case "prod API returns valid empty orgs (reaches AWS)" 0 '{"orgs":[]}' 0 false
|
||||
|
||||
echo
|
||||
echo "passed=$PASS failed=$FAIL"
|
||||
[ "$FAIL" -eq 0 ]
|
||||
Reference in New Issue
Block a user