From ec39fecda295adbf07641212b2419e3c3d411eca Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Thu, 30 Apr 2026 11:35:56 -0700
Subject: [PATCH] fix(ci): hard-fail when >50% of fleet unreachable
 post-redeploy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Belt-and-suspenders sanity floor on top of the unreachable-soft-warn
introduced earlier in this PR. Addresses the residual gap noted in
review: if a new image crashes on startup, every tenant ends up
unreachable, and the soft-warn alone would let that ship as a green
deploy. Canary-verify catches it on the canary tenant first, but this
guard is a fallback for canary-skip dispatches and same-batch races.

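Threshold is 50% of healthz_ok-snapshotted tenants — comfortably above
the typical e2e-* teardown rate (5-10/hour, ~1 ephemeral tenant per
batch) but below any plausible real-outage scenario. The comparison is
strict and uses integer division
(UNREACHABLE_COUNT > TOTAL_VERIFIED / 2), so exactly half the fleet
being unreachable still only soft-warns; one more tenant hard-fails.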

Mirrored across redeploy-tenants-on-staging.yml and
redeploy-tenants-on-main.yml for shape parity.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/redeploy-tenants-on-main.yml    | 10 ++++++++++
 .github/workflows/redeploy-tenants-on-staging.yml | 12 ++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml
index efacbe69..466c0249 100644
--- a/.github/workflows/redeploy-tenants-on-main.yml
+++ b/.github/workflows/redeploy-tenants-on-main.yml
@@ -306,6 +306,16 @@ jobs:
           if [ $UNREACHABLE_COUNT -gt 0 ]; then
             echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
           fi
+
+          # Belt-and-suspenders sanity floor: same logic as the staging
+          # variant — if MORE than half the prod fleet is unreachable,
+          # this is a real outage, not a teardown race. Hard-fail.
+          TOTAL_VERIFIED=${#SLUGS[@]}
+          if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
+            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race."
+            exit 1
+          fi
+
           if [ $STALE_COUNT -gt 0 ]; then
             echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
             exit 1
diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml
index 125f25c1..381d0a65 100644
--- a/.github/workflows/redeploy-tenants-on-staging.yml
+++ b/.github/workflows/redeploy-tenants-on-staging.yml
@@ -283,6 +283,18 @@ jobs:
           if [ $UNREACHABLE_COUNT -gt 0 ]; then
             echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
           fi
+
+          # Belt-and-suspenders sanity floor: if MORE than half the fleet is
+          # unreachable, this isn't a teardown race — it's a real outage
+          # (e.g. the new image crashes on startup). Hard-fail. Canary-verify
+          # would catch this on the canary tenant first; this guard is a
+          # fallback for canary-skip dispatches and same-batch races.
+          TOTAL_VERIFIED=${#SLUGS[@]}
+          if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
+            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race."
+            exit 1
+          fi
+
           if [ $STALE_COUNT -gt 0 ]; then
             echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
             exit 1