diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index efacbe69..46743347 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -306,6 +306,17 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: same logic as the staging + # variant — see the comment in redeploy-tenants-on-staging.yml for + # the full rationale. Floor only applies when fleet >= 4; below + # that, canary-verify is the actual gate. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race." + exit 1 + fi + + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1 diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 125f25c1..7f191e8d 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -283,6 +283,25 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: if MORE than half the fleet is + # unreachable AND the fleet is large enough that "half down" is + # statistically meaningful, this is a real outage (e.g. new image + # crashes on startup), not a teardown race. Hard-fail. 
+ # + # Floor only applies when TOTAL_VERIFIED >= 4 — below that, the + # canary-verify step is the actual gate for "all tenants down" + # detection (it runs against the canary first and aborts the + # rollout if the canary fails to come up). Without the >=4 gate, + # a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a + # quiet staging push) would re-flake on the exact teardown-race + # condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race." + exit 1 + fi + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1