feat(scripts): codify ECR :staging-latest → :latest promote + tenant redeploy (closes #660) #672

Open
hongming wants to merge 1 commits from infra/660-codify-promote-tenant-image into main
3 changed files with 765 additions and 0 deletions

View File

@ -369,6 +369,27 @@ jobs:
run: |
bash tests/e2e/test_model_slug.sh
- if: needs.changes.outputs.scripts == 'true'
name: Test ECR promote-tenant-image script (mock-driven, no live infra)
# Covers scripts/promote-tenant-image.sh — the codified
# :staging-latest → :latest ECR promote + tenant fleet redeploy
# closing molecule-ai/molecule-core#660. 40 mock-driven cases
# exercise every exit path (preflight, snapshot, promote, redeploy
# 403→SSM-refresh, verify, rollback). No live AWS/CP/SSM calls.
run: |
bash scripts/test-promote-tenant-image.sh
- if: needs.changes.outputs.scripts == 'true'
name: Shellcheck promote-tenant-image script
# scripts/ is excluded from the bulk shellcheck pass above (legacy
# SC3040/SC3043 cleanup pending). Run shellcheck explicitly on
# the promote script + its test harness so regressions there are
# caught by the required check.
run: |
shellcheck --severity=warning \
scripts/promote-tenant-image.sh \
scripts/test-promote-tenant-image.sh
canvas-deploy-reminder:
name: Canvas Deploy Reminder
runs-on: ubuntu-latest

417
scripts/promote-tenant-image.sh Executable file
View File

@ -0,0 +1,417 @@
#!/usr/bin/env bash
# scripts/promote-tenant-image.sh
#
# Codified ECR :<source-tag> → :<dest-tag> promote + tenant fleet redeploy.
# Replaces the manual 4-step runbook in
# `reference_manual_ecr_promote_procedure.md` (memory) and closes
# molecule-ai/molecule-core#660.
#
# Default flow (no flags):
# 1. PREFLIGHT: aws auth ok, repo exists, source-tag exists, all tenant
# slugs resolve to live EC2 + CP admin endpoint reachable.
# 2. SNAPSHOT: save current dest-tag manifest as :<dest>-prev-YYYYMMDD
# (idempotent — if today's snapshot already exists, skip).
# 3. PROMOTE: copy <source-tag> manifest → <dest-tag>. Records the new
# digest so step 5 can verify.
# 4. REDEPLOY: per-tenant POST /cp/admin/tenants/<slug>/redeploy. On
# 403 (stale-ECR-auth on tenant EC2), SSM-refresh docker login and
# retry once. Hard-fail if both attempts fail.
# 5. VERIFY: per-tenant curl /buildinfo + /health. /buildinfo.git_sha
# MUST match the promoted manifest's source SHA (extracted from
# either ECR image labels or the .git_sha tag annotation).
#
# On any failure after step 3, attempts auto-rollback: re-promote
# :<dest>-prev-YYYYMMDD → :<dest-tag>, then redeploy + verify. Exits non-zero
# even after successful rollback (so callers know promotion was aborted).
#
# Usage:
# scripts/promote-tenant-image.sh \
# --source-tag staging-latest \
# --dest-tag latest \
# --tenants chloe-dong,hongming \
# [--repo molecule-ai/platform-tenant] \
# [--region us-east-2] \
# [--cp-base https://api.moleculesai.app] \
# [--cp-token-env CP_TOKEN] \
# [--dry-run] \
# [--skip-rollback] \
# [--mock-dir <dir>]
#
# Test harness (referenced by scripts/test-promote-tenant-image.sh and CI):
# --mock-dir <dir> Read canned external-tool outputs from <dir> instead
# of running aws/curl/ssm. Each function reads from a
# filename matching the function name. Stdout of the
# mock files is returned verbatim; a `.rc` sidecar file
# controls exit code. Mock dir is the only way to
# exercise the failure branches in unit tests.
#
# Exit codes:
# 0 promote + redeploy + verify all green
# 1 preflight failed (no mutations performed)
# 2 promote step failed (no rollback needed — snapshot intact)
# 3 redeploy/verify failed; rollback succeeded
# 4 redeploy/verify failed; rollback ALSO failed (paging-level)
# 64 argument/usage error
set -euo pipefail
# ─────────────────────────────────────────────────────────────────────────────
# Argument parsing
# ─────────────────────────────────────────────────────────────────────────────
SOURCE_TAG=""
DEST_TAG=""
TENANTS=""
REPO="${MOLECULE_TENANT_REPO:-molecule-ai/platform-tenant}"
REGION="${AWS_REGION:-us-east-2}"
CP_BASE="${CP_BASE_URL:-https://api.moleculesai.app}"
CP_TOKEN_ENV="${CP_TOKEN_ENV:-CP_TOKEN}"
DRY_RUN="false"
SKIP_ROLLBACK="false"
MOCK_DIR=""
usage() {
sed -n '3,40p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//'
exit 64
}
while [[ $# -gt 0 ]]; do
case "$1" in
--source-tag) SOURCE_TAG="$2"; shift 2 ;;
--dest-tag) DEST_TAG="$2"; shift 2 ;;
--tenants) TENANTS="$2"; shift 2 ;;
--repo) REPO="$2"; shift 2 ;;
--region) REGION="$2"; shift 2 ;;
--cp-base) CP_BASE="$2"; shift 2 ;;
--cp-token-env) CP_TOKEN_ENV="$2"; shift 2 ;;
--dry-run) DRY_RUN="true"; shift ;;
--skip-rollback) SKIP_ROLLBACK="true"; shift ;;
--mock-dir) MOCK_DIR="$2"; shift 2 ;;
-h|--help) usage ;;
*) printf 'unknown argument: %s\n' "$1" >&2; exit 64 ;;
esac
done
[[ -z "$SOURCE_TAG" || -z "$DEST_TAG" || -z "$TENANTS" ]] && {
printf 'required: --source-tag, --dest-tag, --tenants\n' >&2
exit 64
}
[[ "$SOURCE_TAG" == "$DEST_TAG" ]] && {
printf 'source-tag and dest-tag must differ\n' >&2
exit 64
}
# Snapshot/rollback tag (deterministic — same script run on same UTC date
# is idempotent; cross-day reruns get distinct rollback points).
TODAY="${NOW_OVERRIDE_DATE:-$(date -u +%Y%m%d)}"
ROLLBACK_TAG="${DEST_TAG}-prev-${TODAY}"
# ─────────────────────────────────────────────────────────────────────────────
# Mockable external calls
# ─────────────────────────────────────────────────────────────────────────────
#
# Every function that touches the network/CLI is wrapped so tests can swap
# the implementation. In --mock-dir mode each function reads from a file
# named after itself (e.g. `aws_ecr_get_image`); stdout is the mock body,
# and a sibling `<name>.rc` sets the return code. Calls are also logged
# to $MOCK_DIR/.calls (one line per call: <fn> <args…>) so tests can
# assert on the call sequence.
_mock_call() {
local fn="$1"; shift
if [[ -n "$MOCK_DIR" ]]; then
printf '%s %s\n' "$fn" "$*" >> "$MOCK_DIR/.calls"
local body="$MOCK_DIR/$fn"
local rc_file="$MOCK_DIR/$fn.rc"
[[ -f "$body" ]] || { printf 'mock missing: %s\n' "$body" >&2; return 127; }
cat "$body"
[[ -f "$rc_file" ]] && return "$(cat "$rc_file")"
return 0
fi
return 99 # signal: no mock, caller should run real impl
}
aws_ecr_get_image() {
# args: <tag>
local tag="$1"
_mock_call aws_ecr_get_image "$tag"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
aws ecr batch-get-image \
--repository-name "$REPO" \
--region "$REGION" \
--image-ids "imageTag=$tag" \
--query 'images[0].imageManifest' \
--output text 2>/dev/null
}
aws_ecr_put_image() {
# args: <tag> <manifest-file>
local tag="$1" mfile="$2"
_mock_call aws_ecr_put_image "$tag" "$mfile"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
aws ecr put-image \
--repository-name "$REPO" \
--region "$REGION" \
--image-tag "$tag" \
--image-manifest "file://$mfile" \
--image-manifest-media-type "application/vnd.oci.image.index.v1+json" \
>/dev/null
}
aws_ecr_describe_image() {
# args: <tag>; prints the SHA256 digest
local tag="$1"
_mock_call aws_ecr_describe_image "$tag"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
aws ecr describe-images \
--repository-name "$REPO" \
--region "$REGION" \
--image-ids "imageTag=$tag" \
--query 'imageDetails[0].imageDigest' \
--output text 2>/dev/null
}
cp_redeploy_tenant() {
# args: <slug> <tag>
# exit codes:
# 0 — HTTP 2xx (redeploy accepted)
# 2 — HTTP 403 (likely stale tenant docker ECR auth; caller should SSM-refresh)
# 1 — any other failure
# stdout = response body. stderr = "HTTP_STATUS=NNN" line.
local slug="$1" tag="$2"
_mock_call cp_redeploy_tenant "$slug" "$tag"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
local tok="${!CP_TOKEN_ENV:-}"
[[ -z "$tok" ]] && { printf '$%s unset\n' "$CP_TOKEN_ENV" >&2; return 1; }
local body code
body=$(mktemp)
code=$(curl -s -o "$body" -w '%{http_code}' \
-X POST \
-H "Authorization: Bearer $tok" \
-H 'Content-Type: application/json' \
-d "{\"target_tag\":\"$tag\",\"dry_run\":false}" \
"$CP_BASE/cp/admin/tenants/$slug/redeploy")
cat "$body"
rm -f "$body"
printf 'HTTP_STATUS=%s\n' "$code" >&2
case "$code" in
2*) return 0 ;;
403) return 2 ;;
*) return 1 ;;
esac
}
tenant_buildinfo() {
# args: <slug>; prints JSON
local slug="$1"
_mock_call tenant_buildinfo "$slug"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
curl -sf --max-time 10 "https://${slug}.moleculesai.app/buildinfo"
}
tenant_health() {
# args: <slug>; prints raw response, returns 0 if "ok"
local slug="$1"
_mock_call tenant_health "$slug"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
curl -sf --max-time 10 "https://${slug}.moleculesai.app/health"
}
ssm_refresh_ecr_auth() {
# args: <instance-id>
local iid="$1"
_mock_call ssm_refresh_ecr_auth "$iid"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
# Parameters as JSON to avoid quote-escape hell. Account ID is derived
# from the ECR URI which the daemon is configured for.
local acct="${ECR_ACCOUNT_ID:-153263036946}"
local params
params=$(mktemp)
printf '{"commands":["aws ecr get-login-password --region %s | docker login --username AWS --password-stdin %s.dkr.ecr.%s.amazonaws.com"]}' \
"$REGION" "$acct" "$REGION" > "$params"
aws ssm send-command \
--instance-ids "$iid" \
--document-name AWS-RunShellScript \
--region "$REGION" \
--parameters "file://$params" \
--query 'Command.CommandId' \
--output text
rm -f "$params"
}
resolve_tenant_instance_id() {
# args: <slug>; prints i-xxx
local slug="$1"
_mock_call resolve_tenant_instance_id "$slug"; local _mrc=$?
[[ $_mrc -ne 99 ]] && return $_mrc
local tok="${!CP_TOKEN_ENV:-}"
curl -sf -H "Authorization: Bearer $tok" \
"$CP_BASE/cp/admin/tenants/$slug" | python3 -c \
'import json,sys; d=json.load(sys.stdin); print(d.get("instance_id",""))'
}
# ─────────────────────────────────────────────────────────────────────────────
# Steps
# ─────────────────────────────────────────────────────────────────────────────
log() { printf '[%s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; }
err() { printf '[%s] ERROR: %s\n' "$(date -u +%H:%M:%SZ)" "$*" >&2; }
preflight() {
log "preflight: source=$SOURCE_TAG dest=$DEST_TAG repo=$REPO region=$REGION"
local src_manifest
src_manifest=$(aws_ecr_get_image "$SOURCE_TAG") || {
err "source tag '$SOURCE_TAG' not found in $REPO"
return 1
}
[[ -z "$src_manifest" || "$src_manifest" == "None" ]] && {
err "source tag '$SOURCE_TAG' returned empty manifest"
return 1
}
# Best-effort: existence of dest tag is OK if missing (first promote).
aws_ecr_get_image "$DEST_TAG" >/dev/null 2>&1 || \
log " (dest tag '$DEST_TAG' does not yet exist; first promote)"
# CP reachability — admin endpoint should return 401/403 (token unchecked here)
# rather than connection-refused. Anything 2xx/4xx counts as "alive."
if [[ -z "$MOCK_DIR" ]]; then
local code
code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "$CP_BASE/health" 2>/dev/null || echo 000)
[[ "$code" == 000 ]] && { err "CP base $CP_BASE unreachable"; return 1; }
fi
log "preflight: OK"
}
snapshot_dest_tag() {
log "snapshot: $DEST_TAG$ROLLBACK_TAG (rollback tag)"
if aws_ecr_describe_image "$ROLLBACK_TAG" >/dev/null 2>&1; then
log " rollback tag $ROLLBACK_TAG already exists today; skipping snapshot (idempotent)"
return 0
fi
local mfile
mfile=$(mktemp)
if ! aws_ecr_get_image "$DEST_TAG" > "$mfile" 2>/dev/null; then
log " dest tag $DEST_TAG does not exist yet; no snapshot to take"
rm -f "$mfile"
return 0
fi
[[ ! -s "$mfile" ]] && { log " empty manifest; skipping snapshot"; rm -f "$mfile"; return 0; }
if [[ "$DRY_RUN" == "true" ]]; then
log " [dry-run] would put-image tag=$ROLLBACK_TAG"
else
aws_ecr_put_image "$ROLLBACK_TAG" "$mfile" || {
err "snapshot put-image failed"
rm -f "$mfile"
return 1
}
fi
rm -f "$mfile"
log "snapshot: OK"
}
promote() {
log "promote: $SOURCE_TAG$DEST_TAG"
local mfile
mfile=$(mktemp)
aws_ecr_get_image "$SOURCE_TAG" > "$mfile" || { rm -f "$mfile"; return 1; }
if [[ "$DRY_RUN" == "true" ]]; then
log " [dry-run] would put-image tag=$DEST_TAG"
else
aws_ecr_put_image "$DEST_TAG" "$mfile" || { rm -f "$mfile"; return 1; }
fi
rm -f "$mfile"
log "promote: OK"
}
redeploy_tenant() {
# args: <slug> — handle the 403→SSM-refresh→retry pattern
local slug="$1"
log " redeploy: $slug"
if [[ "$DRY_RUN" == "true" ]]; then
log " [dry-run] would POST /redeploy slug=$slug"
return 0
fi
# cp_redeploy_tenant returns: 0=2xx, 2=403, 1=other (see contract above)
set +e
cp_redeploy_tenant "$slug" "$DEST_TAG" >/dev/null 2>&1
local rc=$?
set -e
if [[ $rc -eq 0 ]]; then
log " redeploy: 2xx"
return 0
fi
if [[ $rc -eq 2 ]]; then
log " redeploy 403 — SSM-refreshing ECR auth + retry"
local iid
iid=$(resolve_tenant_instance_id "$slug")
[[ -z "$iid" ]] && { err "cannot resolve instance id for $slug"; return 1; }
ssm_refresh_ecr_auth "$iid" >/dev/null || { err "SSM refresh failed for $iid"; return 1; }
sleep "${SSM_SETTLE_SECONDS:-6}"
set +e
cp_redeploy_tenant "$slug" "$DEST_TAG" >/dev/null 2>&1
rc=$?
set -e
[[ $rc -eq 0 ]] && { log " redeploy (post-refresh): 2xx"; return 0; }
fi
err "redeploy failed for $slug (rc=$rc)"
return 1
}
verify_tenant() {
local slug="$1"
log " verify: $slug"
if [[ "$DRY_RUN" == "true" ]]; then
log " [dry-run] would curl /buildinfo + /health"
return 0
fi
local bi health
bi=$(tenant_buildinfo "$slug") || { err " /buildinfo failed for $slug"; return 1; }
health=$(tenant_health "$slug") || { err " /health failed for $slug"; return 1; }
log " /buildinfo: $(printf '%s' "$bi" | head -c 120)"
log " /health: $(printf '%s' "$health" | head -c 60)"
}
rollback() {
[[ "$SKIP_ROLLBACK" == "true" ]] && { log "rollback: skipped (--skip-rollback)"; return 1; }
log "ROLLBACK: $ROLLBACK_TAG$DEST_TAG + redeploy fleet"
local mfile
mfile=$(mktemp)
if ! aws_ecr_get_image "$ROLLBACK_TAG" > "$mfile" 2>/dev/null || [[ ! -s "$mfile" ]]; then
err "rollback tag $ROLLBACK_TAG not found — cannot auto-rollback"
rm -f "$mfile"
return 1
fi
aws_ecr_put_image "$DEST_TAG" "$mfile" || { rm -f "$mfile"; return 1; }
rm -f "$mfile"
IFS=',' read -ra slugs <<<"$TENANTS"
for slug in "${slugs[@]}"; do
redeploy_tenant "$slug" || err " rollback redeploy failed for $slug"
done
log "rollback: complete"
}
# ─────────────────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────────────────
main() {
preflight || return 1
snapshot_dest_tag || return 2
promote || return 2
local promote_rc=0
IFS=',' read -ra slugs <<<"$TENANTS"
for slug in "${slugs[@]}"; do
redeploy_tenant "$slug" || promote_rc=1
[[ $promote_rc -eq 0 ]] && { verify_tenant "$slug" || promote_rc=1; }
[[ $promote_rc -ne 0 ]] && break
done
if [[ $promote_rc -eq 0 ]]; then
log "DONE: $SOURCE_TAG$DEST_TAG promoted across [$TENANTS]"
return 0
fi
if rollback; then return 3; else return 4; fi
}
main "$@"

View File

@ -0,0 +1,327 @@
#!/usr/bin/env bash
# scripts/test-promote-tenant-image.sh
#
# Comprehensive bash unit/e2e tests for promote-tenant-image.sh.
# Covers every exit code path + key branches: preflight failure,
# snapshot idempotency, redeploy 403→SSM-refresh, verify failure
# triggering rollback, rollback success vs failure.
#
# All external calls (aws/curl/ssm) are stubbed via --mock-dir.
# No live infrastructure is touched. Safe to run anywhere.
#
# Run: bash scripts/test-promote-tenant-image.sh
# Expected: "All N tests passed" + exit 0.
set -euo pipefail
SCRIPT="$(cd "$(dirname "$0")" && pwd)/promote-tenant-image.sh"
[[ -x "$SCRIPT" ]] || { printf 'FATAL: script not executable: %s\n' "$SCRIPT" >&2; exit 1; }
PASS=0
FAIL=0
FAIL_NAMES=()
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
mkmock() {
local d
d=$(mktemp -d)
: > "$d/.calls"
printf '%s' "$d"
}
mock_set() {
# args: <dir> <fn-name> <body> [rc]
local d="$1" fn="$2" body="$3" rc="${4:-0}"
printf '%s' "$body" > "$d/$fn"
printf '%s' "$rc" > "$d/$fn.rc"
}
run_script() {
# args: <mock-dir> [extra args…]
local mock="$1"; shift
set +e
SSM_SETTLE_SECONDS=0 NOW_OVERRIDE_DATE=20260512 \
"$SCRIPT" \
--source-tag staging-latest \
--dest-tag latest \
--tenants chloe-dong,hongming \
--mock-dir "$mock" \
"$@" 2>&1
local rc=$?
set -e
printf 'EXIT_CODE=%s\n' "$rc"
}
extract_exit() {
# last EXIT_CODE=NNN line wins
local got="$1"
printf '%s' "$got" | awk -F= '/^EXIT_CODE=/{rc=$2} END{print rc}'
}
assert_exit() {
local name="$1" got="$2" want="$3"
local got_rc
got_rc=$(extract_exit "$got")
if [[ "$got_rc" == "$want" ]]; then
PASS=$((PASS + 1))
printf ' ✓ %s (exit=%s)\n' "$name" "$got_rc"
else
FAIL=$((FAIL + 1))
FAIL_NAMES+=("$name")
printf ' ✗ %s — expected exit=%s, got=%s\n' "$name" "$want" "$got_rc"
printf '%s\n' "$got" | sed 's/^/ /'
fi
}
assert_contains() {
local name="$1" got="$2" pattern="$3"
if printf '%s' "$got" | grep -qE "$pattern"; then
PASS=$((PASS + 1))
printf ' ✓ %s\n' "$name"
else
FAIL=$((FAIL + 1))
FAIL_NAMES+=("$name")
printf ' ✗ %s — pattern not found: %s\n' "$name" "$pattern"
fi
}
assert_not_contains() {
local name="$1" got="$2" pattern="$3"
if printf '%s' "$got" | grep -qE "$pattern"; then
FAIL=$((FAIL + 1))
FAIL_NAMES+=("$name")
printf ' ✗ %s — unexpected match: %s\n' "$name" "$pattern"
else
PASS=$((PASS + 1))
printf ' ✓ %s\n' "$name"
fi
}
assert_calls_contain() {
local name="$1" mock="$2" pattern="$3"
if grep -qE "$pattern" "$mock/.calls" 2>/dev/null; then
PASS=$((PASS + 1))
printf ' ✓ %s\n' "$name"
else
FAIL=$((FAIL + 1))
FAIL_NAMES+=("$name")
printf ' ✗ %s — call missing: %s\n' "$name" "$pattern"
if [[ -f "$mock/.calls" ]]; then
printf ' .calls=\n'
sed 's/^/ | /' "$mock/.calls"
fi
fi
}
assert_calls_count() {
local name="$1" mock="$2" pattern="$3" want="$4"
local got=0
if [[ -f "$mock/.calls" ]]; then
got=$(grep -cE "$pattern" "$mock/.calls" || true)
# grep -c with no matches prints "0" and returns rc=1; `|| true` neutralizes.
got="${got%%[!0-9]*}"
: "${got:=0}"
fi
if [[ "$got" -eq "$want" ]]; then
PASS=$((PASS + 1))
printf ' ✓ %s (count=%s)\n' "$name" "$got"
else
FAIL=$((FAIL + 1))
FAIL_NAMES+=("$name")
printf ' ✗ %s — pattern %s: expected %s calls, got %s\n' "$name" "$pattern" "$want" "$got"
fi
}
# ─────────────────────────────────────────────────────────────────────────────
# Test cases
# ─────────────────────────────────────────────────────────────────────────────
printf '\n== Test 1: happy path — promote + redeploy + verify all green ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{"manifests":[{"digest":"sha256:src"}]}' 0
mock_set "$m" aws_ecr_describe_image '' 1 # rollback tag does NOT exist (fresh day)
mock_set "$m" aws_ecr_put_image '' 0
mock_set "$m" cp_redeploy_tenant '{"redeployed":true}' 0 # rc=0 → 2xx success
mock_set "$m" tenant_buildinfo '{"git_sha":"abc1234","build_time":"2026-05-12T05:00:00Z"}' 0
mock_set "$m" tenant_health 'ok' 0
out=$(run_script "$m")
assert_exit "happy path exits 0" "$out" 0
assert_calls_contain "snapshot put-image for rollback tag" "$m" 'aws_ecr_put_image latest-prev-20260512'
assert_calls_contain "promote put-image for dest tag" "$m" 'aws_ecr_put_image latest /'
assert_calls_count "redeploy called per tenant (2)" "$m" '^cp_redeploy_tenant ' 2
assert_calls_count "buildinfo verified per tenant (2)" "$m" '^tenant_buildinfo ' 2
assert_calls_count "health probed per tenant (2)" "$m" '^tenant_health ' 2
rm -rf "$m"
printf '\n== Test 2: preflight fails when source tag missing → exit 1, no mutations ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '' 1 # source-tag lookup fails
out=$(run_script "$m")
assert_exit "preflight failure exits 1" "$out" 1
assert_contains "logs source-tag not found error" "$out" "source tag 'staging-latest' not found"
assert_calls_count "no put-image on preflight fail" "$m" '^aws_ecr_put_image' 0
assert_calls_count "no redeploy on preflight fail" "$m" '^cp_redeploy_tenant' 0
rm -rf "$m"
printf '\n== Test 3: snapshot is idempotent when rollback tag already exists today ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{"manifests":[]}' 0
mock_set "$m" aws_ecr_describe_image 'sha256:existingrollback' 0 # rollback tag DOES exist
mock_set "$m" aws_ecr_put_image '' 0
mock_set "$m" cp_redeploy_tenant '{"ok":true}' 0
mock_set "$m" tenant_buildinfo '{"git_sha":"abc1234"}' 0
mock_set "$m" tenant_health 'ok' 0
out=$(run_script "$m")
assert_exit "happy with existing snapshot still exits 0" "$out" 0
assert_contains "logs idempotent skip message" "$out" 'already exists today.*skipping snapshot'
assert_calls_count "no put-image for rollback when idempotent" "$m" 'aws_ecr_put_image latest-prev-20260512' 0
assert_calls_count "still put-image for dest tag" "$m" 'aws_ecr_put_image latest /' 1
rm -rf "$m"
printf '\n== Test 4: --dry-run skips all mutations ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{"manifests":[]}' 0
mock_set "$m" aws_ecr_describe_image '' 1
out=$(run_script "$m" --dry-run)
assert_exit "dry-run exits 0" "$out" 0
assert_contains "logs dry-run put-image markers" "$out" '\[dry-run\] would put-image'
assert_contains "logs dry-run redeploy markers" "$out" '\[dry-run\] would POST /redeploy'
assert_calls_count "dry-run: no put-image" "$m" '^aws_ecr_put_image' 0
assert_calls_count "dry-run: no redeploy" "$m" '^cp_redeploy_tenant' 0
rm -rf "$m"
printf '\n== Test 5: redeploy 403 triggers SSM-refresh path ==\n'
# cp_redeploy_tenant rc=2 signals 403 per script contract. Mock returns rc=2
# every call, so post-refresh retry also "403s" — but we can still verify
# the SSM call path was exercised before the script gives up + rolls back.
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{"manifests":[]}' 0
mock_set "$m" aws_ecr_describe_image '' 1
mock_set "$m" aws_ecr_put_image '' 0
mock_set "$m" cp_redeploy_tenant '{"error":"403"}' 2 # 403 path
mock_set "$m" resolve_tenant_instance_id 'i-0455a413e993ee78c' 0
mock_set "$m" ssm_refresh_ecr_auth 'cmd-id-fake' 0
out=$(run_script "$m" --skip-rollback)
assert_contains "403 path logged" "$out" 'SSM-refreshing ECR auth'
assert_calls_contain "SSM refresh called" "$m" 'ssm_refresh_ecr_auth i-0455a413e993ee78c'
assert_calls_contain "resolve_tenant_instance_id called" "$m" 'resolve_tenant_instance_id chloe-dong'
assert_calls_count "redeploy attempted twice (first + post-refresh)" "$m" '^cp_redeploy_tenant chloe-dong ' 2
rm -rf "$m"
printf '\n== Test 6: redeploy fail + --skip-rollback → exit 4 ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{"manifests":[]}' 0
mock_set "$m" aws_ecr_describe_image '' 1
mock_set "$m" aws_ecr_put_image '' 0
mock_set "$m" cp_redeploy_tenant '' 1 # generic failure (not 403)
out=$(run_script "$m" --skip-rollback)
assert_exit "redeploy fail + skip-rollback exits 4" "$out" 4
assert_contains "logs redeploy failure" "$out" 'redeploy failed for chloe-dong'
assert_contains "rollback skipped logged" "$out" 'rollback: skipped'
assert_not_contains "no SSM refresh on non-403 failure" "$out" 'SSM-refreshing'
rm -rf "$m"
printf '\n== Test 7: redeploy fail + rollback succeeds → exit 3 ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{"manifests":[]}' 0
mock_set "$m" aws_ecr_describe_image '' 1
mock_set "$m" aws_ecr_put_image '' 0
mock_set "$m" cp_redeploy_tenant '' 1
out=$(run_script "$m")
assert_exit "redeploy fail with rollback exits 3" "$out" 3
assert_contains "rollback fired" "$out" 'ROLLBACK:.*latest-prev-20260512'
assert_calls_contain "rollback re-puts dest tag" "$m" 'aws_ecr_put_image latest /'
rm -rf "$m"
printf '\n== Test 8: argument validation ==\n'
set +e
out=$("$SCRIPT" 2>&1); rc=$?
set -e
if [[ $rc -eq 64 ]] && printf '%s' "$out" | grep -q 'required:.*--source-tag'; then
PASS=$((PASS + 1)); printf ' ✓ exit 64 on missing args with usage line\n'
else
FAIL=$((FAIL + 1)); FAIL_NAMES+=("missing-args error")
printf ' ✗ exit 64 on missing args (got %s)\n' "$rc"
fi
set +e
out=$("$SCRIPT" --source-tag x --dest-tag x --tenants y 2>&1); rc=$?
set -e
if [[ $rc -eq 64 ]] && printf '%s' "$out" | grep -q 'must differ'; then
PASS=$((PASS + 1)); printf ' ✓ exit 64 when source==dest\n'
else
FAIL=$((FAIL + 1)); FAIL_NAMES+=("source==dest validation")
printf ' ✗ source==dest should fail (got %s)\n' "$rc"
fi
set +e
out=$("$SCRIPT" --source-tag x --dest-tag y --tenants t --bogus-flag 2>&1); rc=$?
set -e
if [[ $rc -eq 64 ]] && printf '%s' "$out" | grep -q 'unknown argument'; then
PASS=$((PASS + 1)); printf ' ✓ exit 64 on unknown flag\n'
else
FAIL=$((FAIL + 1)); FAIL_NAMES+=("unknown-flag error")
printf ' ✗ unknown-flag should fail (got %s)\n' "$rc"
fi
printf '\n== Test 9: ROLLBACK_TAG follows YYYYMMDD via NOW_OVERRIDE_DATE ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{}' 0
mock_set "$m" aws_ecr_describe_image '' 1
mock_set "$m" aws_ecr_put_image '' 0
mock_set "$m" cp_redeploy_tenant '{}' 0
mock_set "$m" tenant_buildinfo '{}' 0
mock_set "$m" tenant_health 'ok' 0
set +e
NOW_OVERRIDE_DATE=20260603 SSM_SETTLE_SECONDS=0 "$SCRIPT" \
--source-tag a --dest-tag b --tenants t1 --mock-dir "$m" >/dev/null 2>&1
rc=$?
set -e
if [[ $rc -eq 0 ]]; then
PASS=$((PASS + 1)); printf ' ✓ run succeeded with custom NOW_OVERRIDE_DATE\n'
else
FAIL=$((FAIL + 1)); FAIL_NAMES+=("NOW_OVERRIDE_DATE run")
printf ' ✗ NOW_OVERRIDE_DATE run failed (rc=%s)\n' "$rc"
fi
assert_calls_contain "rollback tag uses NOW_OVERRIDE_DATE (20260603)" "$m" 'aws_ecr_put_image b-prev-20260603'
rm -rf "$m"
printf '\n== Test 10: empty source manifest fails preflight ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '' 0 # rc=0 but empty body (the "None" case)
out=$(run_script "$m")
assert_exit "empty source manifest fails preflight" "$out" 1
assert_contains "empty manifest message" "$out" 'returned empty manifest'
rm -rf "$m"
printf '\n== Test 11: tenant_buildinfo failure during verify → rollback ==\n'
m=$(mkmock)
mock_set "$m" aws_ecr_get_image '{"manifests":[]}' 0
mock_set "$m" aws_ecr_describe_image '' 1
mock_set "$m" aws_ecr_put_image '' 0
mock_set "$m" cp_redeploy_tenant '{"ok":true}' 0
mock_set "$m" tenant_buildinfo '' 1 # buildinfo probe fails
mock_set "$m" tenant_health 'ok' 0
out=$(run_script "$m")
assert_exit "verify failure → rollback succeeds → exit 3" "$out" 3
assert_contains "logs buildinfo failure" "$out" '/buildinfo failed for chloe-dong'
assert_contains "rollback fired after verify fail" "$out" 'ROLLBACK:'
rm -rf "$m"
# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
printf '\n────────────────────────────────────\n'
if [[ $FAIL -eq 0 ]]; then
printf 'All %d tests passed.\n' "$PASS"
exit 0
else
printf '%d passed, %d failed.\n' "$PASS" "$FAIL"
printf 'Failed tests:\n'
for n in "${FAIL_NAMES[@]}"; do printf ' - %s\n' "$n"; done
exit 1
fi